{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2315886984715146,
  "eval_steps": 500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 175.375,
      "completions/mean_terminated_length": 175.375,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.23963595181703568,
      "epoch": 4.631773969430292e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 1e-06,
      "loss": 0.0,
      "num_tokens": 41958.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 1,
      "step_time": 21.253444358706474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 134.875,
      "completions/mean_terminated_length": 134.875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.2839031368494034,
      "epoch": 9.263547938860583e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 9.99990736452061e-07,
      "loss": 0.0,
      "num_tokens": 68964.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2,
      "step_time": 16.527244716882706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 134.9375,
      "completions/mean_terminated_length": 134.9375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.27176588773727417,
      "epoch": 0.00013895321908290875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 9.999814729041222e-07,
      "loss": 0.0,
      "num_tokens": 91523.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3,
      "step_time": 15.215722694993019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 131.5,
      "completions/mean_terminated_length": 131.5,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.19801294803619385,
      "epoch": 0.00018527095877721167,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 9.999722093561833e-07,
      "loss": 0.0,
      "num_tokens": 113883.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4,
      "step_time": 14.227380692958832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 174.6875,
      "completions/mean_terminated_length": 174.6875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3308049142360687,
      "epoch": 0.0002315886984715146,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 9.999629458082445e-07,
      "loss": 0.0,
      "num_tokens": 147846.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 5,
      "step_time": 20.853050660341978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 182.6875,
      "completions/mean_terminated_length": 182.6875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3976907953619957,
      "epoch": 0.0002779064381658175,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06878510117530823,
      "kl": 0.0,
      "learning_rate": 9.999536822603056e-07,
      "loss": 0.0094,
      "num_tokens": 174929.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 6,
      "step_time": 20.855298921465874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 203.9375,
      "completions/mean_terminated_length": 203.9375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.4255857914686203,
      "epoch": 0.0003242241778601204,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001779966289177537,
      "kl": 0.002432417415548116,
      "learning_rate": 9.999444187123667e-07,
      "loss": 0.0001,
      "num_tokens": 202720.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 7,
      "step_time": 21.188715610653162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 146.125,
      "completions/mean_terminated_length": 146.125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.34608907252550125,
      "epoch": 0.00037054191755442334,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007893778383731842,
      "kl": 0.0012565676588565111,
      "learning_rate": 9.99935155164428e-07,
      "loss": 0.0001,
      "num_tokens": 226850.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 8,
      "step_time": 17.182187285274267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 126.6875,
      "completions/mean_terminated_length": 126.6875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2665010169148445,
      "epoch": 0.00041685965724872626,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009253919124603271,
      "kl": 0.0012993732816539705,
      "learning_rate": 9.999258916164892e-07,
      "loss": 0.0001,
      "num_tokens": 248829.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 9,
      "step_time": 14.068961184471846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 161.25,
      "completions/mean_terminated_length": 161.25,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.38808972388505936,
      "epoch": 0.0004631773969430292,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020301910117268562,
      "kl": 0.002197462454205379,
      "learning_rate": 9.999166280685503e-07,
      "loss": 0.0001,
      "num_tokens": 297489.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 10,
      "step_time": 22.830876268446445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 199.0,
      "completions/mean_terminated_length": 199.0,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.16765369474887848,
      "epoch": 0.0005094951366373321,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005631350795738399,
      "kl": 0.0007857431482989341,
      "learning_rate": 9.999073645206112e-07,
      "loss": 0.0,
      "num_tokens": 320465.0,
      "reward": 0.9084742069244385,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9084742069244385,
      "rewards/reward_func/std": 0.0,
      "step": 11,
      "step_time": 19.6044539809227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 148.5,
      "completions/mean_terminated_length": 148.5,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3356877490878105,
      "epoch": 0.000555812876331635,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001956203021109104,
      "kl": 0.0015761206741444767,
      "learning_rate": 9.998981009726725e-07,
      "loss": 0.0001,
      "num_tokens": 341017.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 12,
      "step_time": 16.55023478344083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 237.375,
      "completions/mean_terminated_length": 237.375,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.2522463947534561,
      "epoch": 0.0006021306160259379,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005217111902311444,
      "kl": 0.0008352900767931715,
      "learning_rate": 9.998888374247337e-07,
      "loss": 0.0,
      "num_tokens": 363679.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 13,
      "step_time": 22.73781155049801
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 173.125,
      "completions/mean_terminated_length": 173.125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.35162051767110825,
      "epoch": 0.0006484483557202408,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006630179705098271,
      "kl": 0.0011639624572126195,
      "learning_rate": 9.998795738767948e-07,
      "loss": 0.0001,
      "num_tokens": 386353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 14,
      "step_time": 19.6433484852314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 239.5625,
      "completions/mean_terminated_length": 239.5625,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "entropy": 0.16075202450156212,
      "epoch": 0.0006947660954145438,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010145703563466668,
      "kl": 0.0008467905863653868,
      "learning_rate": 9.99870310328856e-07,
      "loss": 0.0,
      "num_tokens": 411306.0,
      "reward": 0.7784501910209656,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7784501910209656,
      "rewards/reward_func/std": 0.0,
      "step": 15,
      "step_time": 22.901156540960073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 125.625,
      "completions/mean_terminated_length": 125.625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2599363848567009,
      "epoch": 0.0007410838351088467,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00056539720389992,
      "kl": 0.0009439134591957554,
      "learning_rate": 9.99861046780917e-07,
      "loss": 0.0,
      "num_tokens": 434612.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 16,
      "step_time": 14.680199645459652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 130.8125,
      "completions/mean_terminated_length": 130.8125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.2634947672486305,
      "epoch": 0.0007874015748031496,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.024002104997634888,
      "kl": 0.003976634790888056,
      "learning_rate": 9.998517832329782e-07,
      "loss": 0.0002,
      "num_tokens": 456801.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 17,
      "step_time": 14.279331889003515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 163.1875,
      "completions/mean_terminated_length": 163.1875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.4323969930410385,
      "epoch": 0.0008337193144974525,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002135026967152953,
      "kl": 0.0015527470386587083,
      "learning_rate": 9.998425196850393e-07,
      "loss": 0.0001,
      "num_tokens": 492628.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 18,
      "step_time": 21.967232834547758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 211.75,
      "completions/mean_terminated_length": 211.75,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.13712666556239128,
      "epoch": 0.0008800370541917554,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003543617494869977,
      "kl": 0.0005839392761117779,
      "learning_rate": 9.998332561371004e-07,
      "loss": 0.0,
      "num_tokens": 519328.0,
      "reward": 0.7462587356567383,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7462587356567383,
      "rewards/reward_func/std": 0.0,
      "step": 19,
      "step_time": 21.239732574671507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 123.5,
      "completions/mean_terminated_length": 123.5,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.24722861126065254,
      "epoch": 0.0009263547938860583,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014625731855630875,
      "kl": 0.0013496327155735344,
      "learning_rate": 9.998239925891615e-07,
      "loss": 0.0001,
      "num_tokens": 538920.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 20,
      "step_time": 13.600695561617613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 124.1875,
      "completions/mean_terminated_length": 124.1875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.27628303319215775,
      "epoch": 0.0009726725335803613,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001170438015833497,
      "kl": 0.001314763183472678,
      "learning_rate": 9.998147290412229e-07,
      "loss": 0.0001,
      "num_tokens": 561419.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 21,
      "step_time": 14.672110460698605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 153.1875,
      "completions/mean_terminated_length": 153.1875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.1855221688747406,
      "epoch": 0.0010189902732746642,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012163659557700157,
      "kl": 0.0011993444641120732,
      "learning_rate": 9.99805465493284e-07,
      "loss": 0.0001,
      "num_tokens": 583918.0,
      "reward": 0.3678794503211975,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3678794503211975,
      "rewards/reward_func/std": 0.0,
      "step": 22,
      "step_time": 17.95040387660265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 201.0625,
      "completions/mean_terminated_length": 201.0625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.23063024878501892,
      "epoch": 0.001065308012968967,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008595545659773052,
      "kl": 0.0009587571257725358,
      "learning_rate": 9.99796201945345e-07,
      "loss": 0.0,
      "num_tokens": 621839.0,
      "reward": 0.11362193524837494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11362193524837494,
      "rewards/reward_func/std": 0.0,
      "step": 23,
      "step_time": 23.448596190661192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 301.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 247.9375,
      "completions/mean_terminated_length": 247.9375,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "entropy": 0.19920672848820686,
      "epoch": 0.00111162575266327,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005527918692678213,
      "kl": 0.0008416420023422688,
      "learning_rate": 9.99786938397406e-07,
      "loss": 0.0,
      "num_tokens": 660942.0,
      "reward": 0.8787640929222107,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8787640929222107,
      "rewards/reward_func/std": 0.0,
      "step": 24,
      "step_time": 28.79375683888793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 292.0,
      "completions/max_terminated_length": 292.0,
      "completions/mean_length": 216.5,
      "completions/mean_terminated_length": 216.5,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.34480341523885727,
      "epoch": 0.001157943492357573,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.057788483798503876,
      "kl": 0.0012676734913839027,
      "learning_rate": 9.997776748494674e-07,
      "loss": -0.1737,
      "num_tokens": 682742.0,
      "reward": 0.24754419922828674,
      "reward_std": 0.4429076910018921,
      "rewards/reward_func/mean": 0.24754419922828674,
      "rewards/reward_func/std": 0.4429076910018921,
      "step": 25,
      "step_time": 24.72364231571555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 180.25,
      "completions/mean_terminated_length": 180.25,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.38395222276449203,
      "epoch": 0.0012042612320518759,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09764150530099869,
      "kl": 0.001582463999511674,
      "learning_rate": 9.997684113015285e-07,
      "loss": -0.0923,
      "num_tokens": 711834.0,
      "reward": 0.01261853240430355,
      "reward_std": 0.0504741296172142,
      "rewards/reward_func/mean": 0.01261853240430355,
      "rewards/reward_func/std": 0.0504741333425045,
      "step": 26,
      "step_time": 22.57013550028205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 169.375,
      "completions/mean_terminated_length": 169.375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3620755672454834,
      "epoch": 0.0012505789717461788,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007554744952358305,
      "kl": 0.0011183345050085336,
      "learning_rate": 9.997591477535896e-07,
      "loss": 0.0001,
      "num_tokens": 741312.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 27,
      "step_time": 19.41274269670248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 122.0,
      "completions/max_terminated_length": 122.0,
      "completions/mean_length": 110.1875,
      "completions/mean_terminated_length": 110.1875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.2566985860466957,
      "epoch": 0.0012968967114404817,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001454993849620223,
      "kl": 0.0015755675849504769,
      "learning_rate": 9.997498842056508e-07,
      "loss": 0.0001,
      "num_tokens": 761507.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 28,
      "step_time": 12.01841538771987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 171.0,
      "completions/mean_terminated_length": 171.0,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2279587686061859,
      "epoch": 0.0013432144511347846,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07249391824007034,
      "kl": 0.0012631421268451959,
      "learning_rate": 9.997406206577119e-07,
      "loss": -0.0367,
      "num_tokens": 790355.0,
      "reward": 0.8854333758354187,
      "reward_std": 0.20494304597377777,
      "rewards/reward_func/mean": 0.8854333758354187,
      "rewards/reward_func/std": 0.20494303107261658,
      "step": 29,
      "step_time": 20.076055269688368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 110.0,
      "completions/mean_terminated_length": 110.0,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "entropy": 0.27383508533239365,
      "epoch": 0.0013895321908290875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010895292507484555,
      "kl": 0.0013592140458058566,
      "learning_rate": 9.99731357109773e-07,
      "loss": 0.0001,
      "num_tokens": 810163.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 30,
      "step_time": 12.570972047746181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 153.3125,
      "completions/mean_terminated_length": 153.3125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.26810621470212936,
      "epoch": 0.0014358499305233904,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00047116258065216243,
      "kl": 0.0006794522632844746,
      "learning_rate": 9.997220935618341e-07,
      "loss": 0.0,
      "num_tokens": 832680.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 31,
      "step_time": 16.420401941984892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 186.5625,
      "completions/mean_terminated_length": 186.5625,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.1992652639746666,
      "epoch": 0.0014821676702176934,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00044166221050545573,
      "kl": 0.0008906413859222084,
      "learning_rate": 9.997128300138952e-07,
      "loss": 0.0,
      "num_tokens": 855585.0,
      "reward": 0.795669436454773,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.795669436454773,
      "rewards/reward_func/std": 0.0,
      "step": 32,
      "step_time": 18.911770571023226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 304.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 282.375,
      "completions/mean_terminated_length": 282.375,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "entropy": 0.24813947081565857,
      "epoch": 0.0015284854099119963,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003291643224656582,
      "kl": 0.0007134305778890848,
      "learning_rate": 9.997035664659564e-07,
      "loss": 0.0,
      "num_tokens": 883431.0,
      "reward": 0.8668729066848755,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8668729066848755,
      "rewards/reward_func/std": 0.0,
      "step": 33,
      "step_time": 26.373576171696186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 305.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 216.625,
      "completions/mean_terminated_length": 216.625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.4462438002228737,
      "epoch": 0.0015748031496062992,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05519348755478859,
      "kl": 0.001218096585944295,
      "learning_rate": 9.996943029180175e-07,
      "loss": 0.0004,
      "num_tokens": 909393.0,
      "reward": 0.16718508303165436,
      "reward_std": 0.29906976222991943,
      "rewards/reward_func/mean": 0.16718508303165436,
      "rewards/reward_func/std": 0.2990697920322418,
      "step": 34,
      "step_time": 26.19732155278325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 168.75,
      "completions/mean_terminated_length": 168.75,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.20621728152036667,
      "epoch": 0.001621120889300602,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017018754733726382,
      "kl": 0.0012577880843309686,
      "learning_rate": 9.996850393700786e-07,
      "loss": 0.0001,
      "num_tokens": 930877.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 35,
      "step_time": 19.29100165143609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 179.875,
      "completions/mean_terminated_length": 179.875,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.14232603088021278,
      "epoch": 0.001667438628994905,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003708863805513829,
      "kl": 0.0005206533242017031,
      "learning_rate": 9.996757758221397e-07,
      "loss": 0.0,
      "num_tokens": 963595.0,
      "reward": 0.8702397346496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8702397346496582,
      "rewards/reward_func/std": 0.0,
      "step": 36,
      "step_time": 21.33883025869727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 257.0,
      "completions/max_terminated_length": 257.0,
      "completions/mean_length": 237.375,
      "completions/mean_terminated_length": 237.375,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "entropy": 0.20478082448244095,
      "epoch": 0.001713756368689208,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004962633247487247,
      "kl": 0.0007221920968731865,
      "learning_rate": 9.996665122742009e-07,
      "loss": 0.0,
      "num_tokens": 987441.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 37,
      "step_time": 22.624291632324457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 353.0,
      "completions/max_terminated_length": 353.0,
      "completions/mean_length": 284.375,
      "completions/mean_terminated_length": 284.375,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "entropy": 0.21290477737784386,
      "epoch": 0.0017600741083835109,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04010778293013573,
      "kl": 0.0008870865567587316,
      "learning_rate": 9.996572487262622e-07,
      "loss": -0.0015,
      "num_tokens": 1020295.0,
      "reward": 0.9311375617980957,
      "reward_std": 0.009968340396881104,
      "rewards/reward_func/mean": 0.9311375617980957,
      "rewards/reward_func/std": 0.009968344122171402,
      "step": 38,
      "step_time": 32.43071475997567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 171.8125,
      "completions/mean_terminated_length": 171.8125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.4042629078030586,
      "epoch": 0.0018063918480778138,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006234828033484519,
      "kl": 0.0012812706409022212,
      "learning_rate": 9.996479851783233e-07,
      "loss": 0.0001,
      "num_tokens": 1061252.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 39,
      "step_time": 23.445143539458513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 187.625,
      "completions/mean_terminated_length": 187.625,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.4109330251812935,
      "epoch": 0.0018527095877721167,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06005103141069412,
      "kl": 0.0015019657730590552,
      "learning_rate": 9.996387216303845e-07,
      "loss": 0.0871,
      "num_tokens": 1083774.0,
      "reward": 0.4600222110748291,
      "reward_std": 0.47510889172554016,
      "rewards/reward_func/mean": 0.4600222110748291,
      "rewards/reward_func/std": 0.47510892152786255,
      "step": 40,
      "step_time": 21.93445473909378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 290.0,
      "completions/max_terminated_length": 290.0,
      "completions/mean_length": 254.4375,
      "completions/mean_terminated_length": 254.4375,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "entropy": 0.19949688762426376,
      "epoch": 0.0018990273274664196,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003561973571777344,
      "kl": 0.0006408548942999914,
      "learning_rate": 9.996294580824454e-07,
      "loss": 0.0,
      "num_tokens": 1118309.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 41,
      "step_time": 27.413367446511984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 335.0,
      "completions/max_terminated_length": 335.0,
      "completions/mean_length": 246.625,
      "completions/mean_terminated_length": 246.625,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.3479044884443283,
      "epoch": 0.0019453450671607225,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05840029940009117,
      "kl": 0.0015336884825956076,
      "learning_rate": 9.996201945345067e-07,
      "loss": -0.1468,
      "num_tokens": 1145231.0,
      "reward": 0.6128644347190857,
      "reward_std": 0.4902915358543396,
      "rewards/reward_func/mean": 0.6128644347190857,
      "rewards/reward_func/std": 0.490291565656662,
      "step": 42,
      "step_time": 28.557862129062414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 173.25,
      "completions/mean_terminated_length": 173.25,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.29058726504445076,
      "epoch": 0.0019916628068550254,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07597646862268448,
      "kl": 0.0010675753292161971,
      "learning_rate": 9.996109309865678e-07,
      "loss": 0.0587,
      "num_tokens": 1165987.0,
      "reward": 0.2719818353652954,
      "reward_std": 0.07377496361732483,
      "rewards/reward_func/mean": 0.2719818353652954,
      "rewards/reward_func/std": 0.07377497106790543,
      "step": 43,
      "step_time": 18.770597979426384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 223.625,
      "completions/mean_terminated_length": 223.625,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "entropy": 0.25469207763671875,
      "epoch": 0.0020379805465493284,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000462218071334064,
      "kl": 0.0010333134850952774,
      "learning_rate": 9.99601667438629e-07,
      "loss": 0.0001,
      "num_tokens": 1195453.0,
      "reward": 0.30568957328796387,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.30568957328796387,
      "rewards/reward_func/std": 0.0,
      "step": 44,
      "step_time": 23.699180126190186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 128.625,
      "completions/mean_terminated_length": 128.625,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.32476720958948135,
      "epoch": 0.0020842982862436313,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007233992218971252,
      "kl": 0.0013380103919189423,
      "learning_rate": 9.9959240389069e-07,
      "loss": 0.0001,
      "num_tokens": 1216535.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 45,
      "step_time": 15.907741725444794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 263.0,
      "completions/max_terminated_length": 263.0,
      "completions/mean_length": 219.75,
      "completions/mean_terminated_length": 219.75,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.46384233236312866,
      "epoch": 0.002130616025937934,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005010095192119479,
      "kl": 0.0013451931299641728,
      "learning_rate": 9.995831403427512e-07,
      "loss": 0.0001,
      "num_tokens": 1243331.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 46,
      "step_time": 23.114768505096436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 278.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 229.3125,
      "completions/mean_terminated_length": 229.3125,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.31266774982213974,
      "epoch": 0.002176933765632237,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006749372696503997,
      "kl": 0.0010217149829259142,
      "learning_rate": 9.995738767948123e-07,
      "loss": 0.0001,
      "num_tokens": 1271976.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 47,
      "step_time": 24.77662756666541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 178.1875,
      "completions/mean_terminated_length": 178.1875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.32404256612062454,
      "epoch": 0.00222325150532654,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008625648333691061,
      "kl": 0.0012001528666587546,
      "learning_rate": 9.995646132468735e-07,
      "loss": 0.0001,
      "num_tokens": 1303483.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 48,
      "step_time": 21.40724455565214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 116.3125,
      "completions/mean_terminated_length": 116.3125,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.29034823924303055,
      "epoch": 0.002269569245020843,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007067320984788239,
      "kl": 0.0011181181180290878,
      "learning_rate": 9.995553496989346e-07,
      "loss": 0.0001,
      "num_tokens": 1324576.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 49,
      "step_time": 14.518561020493507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 136.5,
      "completions/mean_terminated_length": 136.5,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3400954380631447,
      "epoch": 0.002315886984715146,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008458670345135033,
      "kl": 0.0010712836519815028,
      "learning_rate": 9.995460861509957e-07,
      "loss": 0.0001,
      "num_tokens": 1360472.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 50,
      "step_time": 17.930652901530266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 174.125,
      "completions/mean_terminated_length": 174.125,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.27026674151420593,
      "epoch": 0.002362204724409449,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06760480254888535,
      "kl": 0.0009809674374992028,
      "learning_rate": 9.995368226030568e-07,
      "loss": -0.0271,
      "num_tokens": 1391882.0,
      "reward": 0.9607253074645996,
      "reward_std": 0.027347400784492493,
      "rewards/reward_func/mean": 0.9607253074645996,
      "rewards/reward_func/std": 0.027347411960363388,
      "step": 51,
      "step_time": 19.771480850875378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 372.0,
      "completions/max_terminated_length": 372.0,
      "completions/mean_length": 219.8125,
      "completions/mean_terminated_length": 219.8125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.4704762101173401,
      "epoch": 0.0024085224641037517,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003276088973507285,
      "kl": 0.0027438767137937248,
      "learning_rate": 9.995275590551182e-07,
      "loss": 0.0001,
      "num_tokens": 1420887.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 52,
      "step_time": 31.42541616410017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 193.3125,
      "completions/mean_terminated_length": 193.3125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.41140784323215485,
      "epoch": 0.0024548402037980546,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007755675469525158,
      "kl": 0.0015211344871204346,
      "learning_rate": 9.995182955071793e-07,
      "loss": 0.0001,
      "num_tokens": 1468060.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 53,
      "step_time": 25.38789566233754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 167.9375,
      "completions/mean_terminated_length": 167.9375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.27289799973368645,
      "epoch": 0.0025011579434923575,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08337947726249695,
      "kl": 0.0012199645425425842,
      "learning_rate": 9.995090319592402e-07,
      "loss": 0.0885,
      "num_tokens": 1491323.0,
      "reward": 0.7094695568084717,
      "reward_std": 0.42304593324661255,
      "rewards/reward_func/mean": 0.7094695568084717,
      "rewards/reward_func/std": 0.42304593324661255,
      "step": 54,
      "step_time": 19.423756692558527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 204.5625,
      "completions/mean_terminated_length": 204.5625,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.3171669542789459,
      "epoch": 0.0025474756831866605,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005079456022940576,
      "kl": 0.0010534442990319803,
      "learning_rate": 9.994997684113015e-07,
      "loss": 0.0001,
      "num_tokens": 1514996.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 55,
      "step_time": 21.686194479465485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 175.3125,
      "completions/mean_terminated_length": 175.3125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.20267057418823242,
      "epoch": 0.0025937934228809634,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09139242023229599,
      "kl": 0.001020463663735427,
      "learning_rate": 9.994905048633627e-07,
      "loss": -0.0276,
      "num_tokens": 1552345.0,
      "reward": 0.873737096786499,
      "reward_std": 0.08094866573810577,
      "rewards/reward_func/mean": 0.873737096786499,
      "rewards/reward_func/std": 0.08094867318868637,
      "step": 56,
      "step_time": 22.00080531463027
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 220.125,
      "completions/mean_terminated_length": 220.125,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.30425940454006195,
      "epoch": 0.0026401111625752663,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04377205669879913,
      "kl": 0.0012595587322721258,
      "learning_rate": 9.994812413154238e-07,
      "loss": 0.0086,
      "num_tokens": 1575275.0,
      "reward": 0.0071022482588887215,
      "reward_std": 0.0018939328147098422,
      "rewards/reward_func/mean": 0.0071022482588887215,
      "rewards/reward_func/std": 0.001893932931125164,
      "step": 57,
      "step_time": 22.13108843192458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 291.0,
      "completions/max_terminated_length": 291.0,
      "completions/mean_length": 265.375,
      "completions/mean_terminated_length": 265.375,
      "completions/min_length": 245.0,
      "completions/min_terminated_length": 245.0,
      "entropy": 0.2432107925415039,
      "epoch": 0.002686428902269569,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08007922023534775,
      "kl": 0.001151241856859997,
      "learning_rate": 9.99471977767485e-07,
      "loss": -0.0175,
      "num_tokens": 1600033.0,
      "reward": 0.9758262634277344,
      "reward_std": 0.043243277817964554,
      "rewards/reward_func/mean": 0.9758262634277344,
      "rewards/reward_func/std": 0.04324327036738396,
      "step": 58,
      "step_time": 25.022456251084805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 173.1875,
      "completions/mean_terminated_length": 173.1875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.25657934695482254,
      "epoch": 0.002732746641963872,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18090035021305084,
      "kl": 0.0015526409697486088,
      "learning_rate": 9.99462714219546e-07,
      "loss": -0.0344,
      "num_tokens": 1623444.0,
      "reward": 0.0714418813586235,
      "reward_std": 0.02954091690480709,
      "rewards/reward_func/mean": 0.0714418813586235,
      "rewards/reward_func/std": 0.02954091690480709,
      "step": 59,
      "step_time": 19.224395401775837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 197.25,
      "completions/mean_terminated_length": 197.25,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.2592005953192711,
      "epoch": 0.002779064381658175,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006600880296900868,
      "kl": 0.0008924548455979675,
      "learning_rate": 9.994534506716072e-07,
      "loss": 0.0,
      "num_tokens": 1648888.0,
      "reward": 0.9555630087852478,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9555630087852478,
      "rewards/reward_func/std": 0.0,
      "step": 60,
      "step_time": 23.297641325742006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 156.4375,
      "completions/mean_terminated_length": 156.4375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.41588329523801804,
      "epoch": 0.002825382121352478,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001193221309222281,
      "kl": 0.0017402568482793868,
      "learning_rate": 9.994441871236683e-07,
      "loss": 0.0001,
      "num_tokens": 1684431.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 61,
      "step_time": 19.046519339084625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 121.0,
      "completions/mean_terminated_length": 121.0,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2954227104783058,
      "epoch": 0.002871699861046781,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009379137773066759,
      "kl": 0.001311119005549699,
      "learning_rate": 9.994349235757294e-07,
      "loss": 0.0001,
      "num_tokens": 1705695.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 62,
      "step_time": 14.727204084396362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 167.25,
      "completions/mean_terminated_length": 167.25,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.39179327338933945,
      "epoch": 0.002918017600741084,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009734238847158849,
      "kl": 0.0014574054221156985,
      "learning_rate": 9.994256600277905e-07,
      "loss": 0.0001,
      "num_tokens": 1739715.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 63,
      "step_time": 20.273111023008823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 181.8125,
      "completions/mean_terminated_length": 181.8125,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.24816354736685753,
      "epoch": 0.0029643353404353867,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06950819492340088,
      "kl": 0.0007540240039816126,
      "learning_rate": 9.994163964798517e-07,
      "loss": -0.0679,
      "num_tokens": 1762288.0,
      "reward": 0.30373460054397583,
      "reward_std": 0.16260090470314026,
      "rewards/reward_func/mean": 0.30373460054397583,
      "rewards/reward_func/std": 0.16260090470314026,
      "step": 64,
      "step_time": 18.781805235892534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 118.125,
      "completions/mean_terminated_length": 118.125,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "entropy": 0.31709306687116623,
      "epoch": 0.0030106530801296896,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012367722811177373,
      "kl": 0.0016394878621213138,
      "learning_rate": 9.99407132931913e-07,
      "loss": 0.0001,
      "num_tokens": 1786210.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 65,
      "step_time": 16.068804062902927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 138.625,
      "completions/mean_terminated_length": 138.625,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.3293949067592621,
      "epoch": 0.0030569708198239925,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013366470811888576,
      "kl": 0.0013818318548146635,
      "learning_rate": 9.99397869383974e-07,
      "loss": 0.0001,
      "num_tokens": 1816860.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 66,
      "step_time": 17.489098727703094
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 172.875,
      "completions/mean_terminated_length": 172.875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.3310137316584587,
      "epoch": 0.0031032885595182955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006151251727715135,
      "kl": 0.0010024872026406229,
      "learning_rate": 9.99388605836035e-07,
      "loss": 0.0001,
      "num_tokens": 1838986.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 67,
      "step_time": 18.4479684792459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 130.8125,
      "completions/mean_terminated_length": 130.8125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.29231955111026764,
      "epoch": 0.0031496062992125984,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001049146754667163,
      "kl": 0.0010573577746981755,
      "learning_rate": 9.993793422880964e-07,
      "loss": 0.0001,
      "num_tokens": 1864583.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 68,
      "step_time": 15.196908507496119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 152.1875,
      "completions/mean_terminated_length": 152.1875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.4232371523976326,
      "epoch": 0.0031959240389069013,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007615491631440818,
      "kl": 0.001500288664828986,
      "learning_rate": 9.993700787401575e-07,
      "loss": 0.0001,
      "num_tokens": 1917050.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 69,
      "step_time": 23.43141169473529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 198.5,
      "completions/mean_terminated_length": 198.5,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.21531255170702934,
      "epoch": 0.003242241778601204,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08702373504638672,
      "kl": 0.0010492683068150654,
      "learning_rate": 9.993608151922186e-07,
      "loss": 0.0175,
      "num_tokens": 1954658.0,
      "reward": 0.9878142476081848,
      "reward_std": 0.033297814428806305,
      "rewards/reward_func/mean": 0.9878142476081848,
      "rewards/reward_func/std": 0.033297814428806305,
      "step": 70,
      "step_time": 24.59629587084055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 122.25,
      "completions/mean_terminated_length": 122.25,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2578287646174431,
      "epoch": 0.003288559518295507,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000850670738145709,
      "kl": 0.001109077871660702,
      "learning_rate": 9.993515516442798e-07,
      "loss": 0.0001,
      "num_tokens": 1975414.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 71,
      "step_time": 13.448259372264147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 146.6875,
      "completions/mean_terminated_length": 146.6875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3487822636961937,
      "epoch": 0.00333487725798981,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007427554810419679,
      "kl": 0.001320757990470156,
      "learning_rate": 9.993422880963409e-07,
      "loss": 0.0001,
      "num_tokens": 1997297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 72,
      "step_time": 16.248118489980698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 171.4375,
      "completions/mean_terminated_length": 171.4375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.2556117959320545,
      "epoch": 0.003381194997684113,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.057933662086725235,
      "kl": 0.001071646562195383,
      "learning_rate": 9.99333024548402e-07,
      "loss": -0.077,
      "num_tokens": 2020696.0,
      "reward": 0.1669032722711563,
      "reward_std": 0.07607479393482208,
      "rewards/reward_func/mean": 0.1669032722711563,
      "rewards/reward_func/std": 0.07607479393482208,
      "step": 73,
      "step_time": 20.15308902412653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 191.3125,
      "completions/mean_terminated_length": 191.3125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.2702726535499096,
      "epoch": 0.003427512737378416,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004334551631473005,
      "kl": 0.0007109080324880779,
      "learning_rate": 9.993237610004631e-07,
      "loss": 0.0,
      "num_tokens": 2045085.0,
      "reward": 0.6227038502693176,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6227038502693176,
      "rewards/reward_func/std": 0.0,
      "step": 74,
      "step_time": 20.211932979524136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 287.0,
      "completions/max_terminated_length": 287.0,
      "completions/mean_length": 212.5625,
      "completions/mean_terminated_length": 212.5625,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.44178541004657745,
      "epoch": 0.003473830477072719,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0010382463224232197,
      "kl": 0.0013108167913742363,
      "learning_rate": 9.993144974525243e-07,
      "loss": -0.0006,
      "num_tokens": 2082582.0,
      "reward": 5.98188080402906e-07,
      "reward_std": 1.0700713346523116e-06,
      "rewards/reward_func/mean": 5.98188080402906e-07,
      "rewards/reward_func/std": 1.0700713346523116e-06,
      "step": 75,
      "step_time": 28.365940377116203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 151.8125,
      "completions/mean_terminated_length": 151.8125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.4370253086090088,
      "epoch": 0.0035201482167670217,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005119937704876065,
      "kl": 0.0011659206065814942,
      "learning_rate": 9.993052339045854e-07,
      "loss": 0.0001,
      "num_tokens": 2104723.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 76,
      "step_time": 16.270573265850544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 186.5625,
      "completions/mean_terminated_length": 186.5625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.14407925307750702,
      "epoch": 0.0035664659564613246,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06740093976259232,
      "kl": 0.0008118026380543597,
      "learning_rate": 9.992959703566465e-07,
      "loss": -0.0353,
      "num_tokens": 2141004.0,
      "reward": 0.31709614396095276,
      "reward_std": 0.3274954855442047,
      "rewards/reward_func/mean": 0.31709614396095276,
      "rewards/reward_func/std": 0.3274955153465271,
      "step": 77,
      "step_time": 22.43112090975046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 150.5625,
      "completions/mean_terminated_length": 150.5625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.2840983420610428,
      "epoch": 0.0036127836961556276,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011623201426118612,
      "kl": 0.0012132947449572384,
      "learning_rate": 9.992867068087076e-07,
      "loss": 0.0001,
      "num_tokens": 2177349.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 78,
      "step_time": 18.93928075954318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 176.3125,
      "completions/mean_terminated_length": 176.3125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.4052634462714195,
      "epoch": 0.0036591014358499305,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00051750527927652,
      "kl": 0.001296806090977043,
      "learning_rate": 9.992774432607688e-07,
      "loss": 0.0001,
      "num_tokens": 2198186.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 79,
      "step_time": 19.795941203832626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 123.75,
      "completions/mean_terminated_length": 123.75,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.27210986614227295,
      "epoch": 0.0037054191755442334,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011587308254092932,
      "kl": 0.0010845881624845788,
      "learning_rate": 9.992681797128299e-07,
      "loss": 0.0001,
      "num_tokens": 2220902.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 80,
      "step_time": 14.425421085208654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 190.8125,
      "completions/mean_terminated_length": 190.8125,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.2371511086821556,
      "epoch": 0.0037517369152385363,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05840691179037094,
      "kl": 0.0011019718513125554,
      "learning_rate": 9.99258916164891e-07,
      "loss": -0.0436,
      "num_tokens": 2252787.0,
      "reward": 0.9001584053039551,
      "reward_std": 0.05953400582075119,
      "rewards/reward_func/mean": 0.9001584053039551,
      "rewards/reward_func/std": 0.05953400954604149,
      "step": 81,
      "step_time": 21.62469592690468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 185.9375,
      "completions/mean_terminated_length": 185.9375,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.3472830802202225,
      "epoch": 0.0037980546549328392,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001633723615668714,
      "kl": 0.0014273902197601274,
      "learning_rate": 9.992496526169523e-07,
      "loss": 0.0001,
      "num_tokens": 2283138.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 82,
      "step_time": 20.354945544153452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 152.9375,
      "completions/mean_terminated_length": 152.9375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.4043729901313782,
      "epoch": 0.003844372394627142,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005513399373739958,
      "kl": 0.001414911908796057,
      "learning_rate": 9.992403890690135e-07,
      "loss": 0.0001,
      "num_tokens": 2321425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 83,
      "step_time": 20.47370319440961
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 158.5,
      "completions/mean_terminated_length": 158.5,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.24156905710697174,
      "epoch": 0.003890690134321445,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008328308467753232,
      "kl": 0.0012390459014568478,
      "learning_rate": 9.992311255210746e-07,
      "loss": 0.0001,
      "num_tokens": 2342617.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 84,
      "step_time": 16.063272561877966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 228.125,
      "completions/mean_terminated_length": 228.125,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.28923826664686203,
      "epoch": 0.003937007874015748,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2853640615940094,
      "kl": 0.002009062358411029,
      "learning_rate": 9.992218619731357e-07,
      "loss": -0.0715,
      "num_tokens": 2376203.0,
      "reward": 0.5002837181091309,
      "reward_std": 0.29904577136039734,
      "rewards/reward_func/mean": 0.5002837181091309,
      "rewards/reward_func/std": 0.29904577136039734,
      "step": 85,
      "step_time": 28.082806132733822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 158.8125,
      "completions/mean_terminated_length": 158.8125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.1473250687122345,
      "epoch": 0.003983325613710051,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06612526625394821,
      "kl": 0.0007407844095723704,
      "learning_rate": 9.992125984251968e-07,
      "loss": -0.063,
      "num_tokens": 2398520.0,
      "reward": 0.9102447628974915,
      "reward_std": 0.06027979403734207,
      "rewards/reward_func/mean": 0.9102447628974915,
      "rewards/reward_func/std": 0.06027979776263237,
      "step": 86,
      "step_time": 19.420553267002106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 336.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 177.9375,
      "completions/mean_terminated_length": 177.9375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3603175953030586,
      "epoch": 0.004029643353404354,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08079643547534943,
      "kl": 0.0015568426460959017,
      "learning_rate": 9.99203334877258e-07,
      "loss": -0.0448,
      "num_tokens": 2427959.0,
      "reward": 0.007332447916269302,
      "reward_std": 0.02932979166507721,
      "rewards/reward_func/mean": 0.007332447916269302,
      "rewards/reward_func/std": 0.02932979352772236,
      "step": 87,
      "step_time": 29.163668405264616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 130.0,
      "completions/max_terminated_length": 130.0,
      "completions/mean_length": 116.0625,
      "completions/mean_terminated_length": 116.0625,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.28036900609731674,
      "epoch": 0.004075961093098657,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006897732964716852,
      "kl": 0.001202350395033136,
      "learning_rate": 9.99194071329319e-07,
      "loss": 0.0001,
      "num_tokens": 2449688.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 88,
      "step_time": 12.958728298544884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 130.0,
      "completions/mean_terminated_length": 130.0,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.31368735432624817,
      "epoch": 0.00412227883279296,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016356243286281824,
      "kl": 0.0013772174424957484,
      "learning_rate": 9.991848077813802e-07,
      "loss": 0.0001,
      "num_tokens": 2476584.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 89,
      "step_time": 16.863355983048677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 131.25,
      "completions/mean_terminated_length": 131.25,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.20497016608715057,
      "epoch": 0.0041685965724872626,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011138824047520757,
      "kl": 0.0009810531337279826,
      "learning_rate": 9.991755442334413e-07,
      "loss": 0.0,
      "num_tokens": 2496108.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 90,
      "step_time": 13.149852402508259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 125.75,
      "completions/mean_terminated_length": 125.75,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.23887991160154343,
      "epoch": 0.0042149143121815655,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0040979571640491486,
      "kl": 0.0019737385446205735,
      "learning_rate": 9.991662806855025e-07,
      "loss": 0.0001,
      "num_tokens": 2515560.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 91,
      "step_time": 13.428842436522245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 318.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 292.375,
      "completions/mean_terminated_length": 292.375,
      "completions/min_length": 255.0,
      "completions/min_terminated_length": 255.0,
      "entropy": 0.16698342561721802,
      "epoch": 0.004261232051875868,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000315846991725266,
      "kl": 0.0005647511134156957,
      "learning_rate": 9.991570171375636e-07,
      "loss": 0.0,
      "num_tokens": 2544798.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 92,
      "step_time": 28.36004311963916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 172.1875,
      "completions/mean_terminated_length": 172.1875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.3940913528203964,
      "epoch": 0.004307549791570171,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008397761266678572,
      "kl": 0.0015314221964217722,
      "learning_rate": 9.991477535896247e-07,
      "loss": 0.0001,
      "num_tokens": 2595377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 93,
      "step_time": 24.672285731881857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 186.3125,
      "completions/mean_terminated_length": 186.3125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.44038259983062744,
      "epoch": 0.004353867531264474,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005806896951980889,
      "kl": 0.0015431945794261992,
      "learning_rate": 9.991384900416858e-07,
      "loss": 0.0001,
      "num_tokens": 2618486.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 94,
      "step_time": 21.164959002286196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 129.5,
      "completions/mean_terminated_length": 129.5,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.331402949988842,
      "epoch": 0.004400185270958777,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008044900605455041,
      "kl": 0.0013099743518978357,
      "learning_rate": 9.991292264937472e-07,
      "loss": 0.0001,
      "num_tokens": 2651630.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 95,
      "step_time": 16.487524412572384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 170.0625,
      "completions/mean_terminated_length": 170.0625,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.18016548454761505,
      "epoch": 0.00444650301065308,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00039947155164554715,
      "kl": 0.0006883219321025535,
      "learning_rate": 9.991199629458083e-07,
      "loss": 0.0,
      "num_tokens": 2672847.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 96,
      "step_time": 17.446692943572998
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 125.0,
      "completions/max_terminated_length": 125.0,
      "completions/mean_length": 106.6875,
      "completions/mean_terminated_length": 106.6875,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.1923675499856472,
      "epoch": 0.004492820750347383,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013796341372653842,
      "kl": 0.001208607602166012,
      "learning_rate": 9.991106993978692e-07,
      "loss": 0.0001,
      "num_tokens": 2691770.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 97,
      "step_time": 12.108869213610888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 175.375,
      "completions/mean_terminated_length": 175.375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.40798864513635635,
      "epoch": 0.004539138490041686,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036100484430789948,
      "kl": 0.0022683410497847944,
      "learning_rate": 9.991014358499306e-07,
      "loss": 0.0001,
      "num_tokens": 2738384.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 98,
      "step_time": 24.993323389440775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 176.125,
      "completions/mean_terminated_length": 176.125,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.33561981469392776,
      "epoch": 0.004585456229735989,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005170278600417078,
      "kl": 0.001088321630959399,
      "learning_rate": 9.990921723019917e-07,
      "loss": 0.0001,
      "num_tokens": 2765506.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 99,
      "step_time": 20.452530715614557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 172.6875,
      "completions/mean_terminated_length": 172.6875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3813248723745346,
      "epoch": 0.004631773969430292,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019012981792911887,
      "kl": 0.001573115645442158,
      "learning_rate": 9.990829087540528e-07,
      "loss": 0.0001,
      "num_tokens": 2800061.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 100,
      "step_time": 21.43118468299508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 161.8125,
      "completions/mean_terminated_length": 161.8125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.25710832327604294,
      "epoch": 0.004678091709124595,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004794577253051102,
      "kl": 0.0007824498461559415,
      "learning_rate": 9.99073645206114e-07,
      "loss": 0.0,
      "num_tokens": 2830154.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 101,
      "step_time": 19.95168798044324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 172.6875,
      "completions/mean_terminated_length": 172.6875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.3496035486459732,
      "epoch": 0.004724409448818898,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009181686327792704,
      "kl": 0.0015211670834105462,
      "learning_rate": 9.99064381658175e-07,
      "loss": 0.0001,
      "num_tokens": 2880581.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 102,
      "step_time": 25.15399621427059
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 228.3125,
      "completions/mean_terminated_length": 228.3125,
      "completions/min_length": 203.0,
      "completions/min_terminated_length": 203.0,
      "entropy": 0.17268018424510956,
      "epoch": 0.0047707271885132005,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004131859459448606,
      "kl": 0.0007152085017878562,
      "learning_rate": 9.990551181102362e-07,
      "loss": 0.0,
      "num_tokens": 2910346.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 103,
      "step_time": 23.317954447120428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 154.25,
      "completions/mean_terminated_length": 154.25,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3152758330106735,
      "epoch": 0.004817044928207503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005372532177716494,
      "kl": 0.0011037293006666005,
      "learning_rate": 9.990458545622973e-07,
      "loss": 0.0001,
      "num_tokens": 2932686.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 104,
      "step_time": 16.74936816468835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 240.9375,
      "completions/mean_terminated_length": 240.9375,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.18453437089920044,
      "epoch": 0.004863362667901806,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04752209037542343,
      "kl": 0.0006047881179256365,
      "learning_rate": 9.990365910143584e-07,
      "loss": 0.0259,
      "num_tokens": 2956525.0,
      "reward": 0.9409773349761963,
      "reward_std": 0.16128070652484894,
      "rewards/reward_func/mean": 0.9409773349761963,
      "rewards/reward_func/std": 0.16128070652484894,
      "step": 105,
      "step_time": 22.666839264333248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 121.875,
      "completions/mean_terminated_length": 121.875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2953808978199959,
      "epoch": 0.004909680407596109,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000951802299823612,
      "kl": 0.0012901290610898286,
      "learning_rate": 9.990273274664196e-07,
      "loss": 0.0001,
      "num_tokens": 2976331.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 106,
      "step_time": 12.871876332908869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 133.5,
      "completions/mean_terminated_length": 133.5,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.30508697777986526,
      "epoch": 0.004955998147290412,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011718884343281388,
      "kl": 0.0012946214410476387,
      "learning_rate": 9.990180639184807e-07,
      "loss": 0.0001,
      "num_tokens": 3012227.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 107,
      "step_time": 18.159315083175898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 149.125,
      "completions/mean_terminated_length": 149.125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.1483689248561859,
      "epoch": 0.005002315886984715,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027634147554636,
      "kl": 0.0009971036633942276,
      "learning_rate": 9.99008800370542e-07,
      "loss": 0.0,
      "num_tokens": 3049013.0,
      "reward": 0.619507372379303,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.619507372379303,
      "rewards/reward_func/std": 0.0,
      "step": 108,
      "step_time": 18.731859609484673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 217.5,
      "completions/mean_terminated_length": 217.5,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.2554764822125435,
      "epoch": 0.005048633626679018,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09361075609922409,
      "kl": 0.0016304054006468505,
      "learning_rate": 9.98999536822603e-07,
      "loss": -0.1054,
      "num_tokens": 3092461.0,
      "reward": 0.7015107870101929,
      "reward_std": 0.4194250702857971,
      "rewards/reward_func/mean": 0.7015107870101929,
      "rewards/reward_func/std": 0.4194250702857971,
      "step": 109,
      "step_time": 27.058271024376154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 267.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 201.25,
      "completions/mean_terminated_length": 201.25,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.24739449098706245,
      "epoch": 0.005094951366373321,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005664157797582448,
      "kl": 0.0010661086562322453,
      "learning_rate": 9.98990273274664e-07,
      "loss": 0.0001,
      "num_tokens": 3116113.0,
      "reward": 0.24110545217990875,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.24110545217990875,
      "rewards/reward_func/std": 0.0,
      "step": 110,
      "step_time": 22.920831225812435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 140.9375,
      "completions/mean_terminated_length": 140.9375,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.27028920128941536,
      "epoch": 0.005141269106067624,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000512939237523824,
      "kl": 0.0008654707198729739,
      "learning_rate": 9.989810097267252e-07,
      "loss": 0.0,
      "num_tokens": 3136016.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 111,
      "step_time": 14.48648601397872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 459.0,
      "completions/max_terminated_length": 459.0,
      "completions/mean_length": 386.3125,
      "completions/mean_terminated_length": 386.3125,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "entropy": 0.27152176201343536,
      "epoch": 0.005187586845761927,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04701411351561546,
      "kl": 0.0008469039894407615,
      "learning_rate": 9.989717461787865e-07,
      "loss": -0.0376,
      "num_tokens": 3178261.0,
      "reward": 0.7923111915588379,
      "reward_std": 0.0785004124045372,
      "rewards/reward_func/mean": 0.7923111915588379,
      "rewards/reward_func/std": 0.0785004124045372,
      "step": 112,
      "step_time": 39.99580450728536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 170.6875,
      "completions/mean_terminated_length": 170.6875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.14264867082238197,
      "epoch": 0.00523390458545623,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005562285077758133,
      "kl": 0.0005924119759583846,
      "learning_rate": 9.989624826308476e-07,
      "loss": 0.0,
      "num_tokens": 3221536.0,
      "reward": 0.8890097737312317,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8890097737312317,
      "rewards/reward_func/std": 0.0,
      "step": 113,
      "step_time": 22.939540166407824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 139.3125,
      "completions/mean_terminated_length": 139.3125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.2521078996360302,
      "epoch": 0.005280222325150533,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008059011306613684,
      "kl": 0.001156355268904008,
      "learning_rate": 9.989532190829088e-07,
      "loss": 0.0001,
      "num_tokens": 3242101.0,
      "reward": 0.0004407913947943598,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0004407913947943598,
      "rewards/reward_func/std": 0.0,
      "step": 114,
      "step_time": 16.212469674646854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 292.0,
      "completions/max_terminated_length": 292.0,
      "completions/mean_length": 236.9375,
      "completions/mean_terminated_length": 236.9375,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.3463738113641739,
      "epoch": 0.0053265400648448355,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.052856896072626114,
      "kl": 0.001222132268594578,
      "learning_rate": 9.989439555349699e-07,
      "loss": -0.1089,
      "num_tokens": 3271172.0,
      "reward": 0.4277864992618561,
      "reward_std": 0.5015459060668945,
      "rewards/reward_func/mean": 0.4277864992618561,
      "rewards/reward_func/std": 0.5015459060668945,
      "step": 115,
      "step_time": 25.421065870672464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 134.125,
      "completions/mean_terminated_length": 134.125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.27230648696422577,
      "epoch": 0.005372857804539138,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010126074776053429,
      "kl": 0.0011005645501427352,
      "learning_rate": 9.98934691987031e-07,
      "loss": 0.0001,
      "num_tokens": 3291558.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 116,
      "step_time": 14.782246958464384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 142.5,
      "completions/mean_terminated_length": 142.5,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.268955335021019,
      "epoch": 0.005419175544233441,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000778664369136095,
      "kl": 0.001104289636714384,
      "learning_rate": 9.989254284390921e-07,
      "loss": 0.0001,
      "num_tokens": 3318830.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 117,
      "step_time": 17.054896883666515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 195.875,
      "completions/mean_terminated_length": 195.875,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.27606628835201263,
      "epoch": 0.005465493283927744,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008477095398120582,
      "kl": 0.0009050240332726389,
      "learning_rate": 9.989161648911533e-07,
      "loss": 0.0,
      "num_tokens": 3359868.0,
      "reward": 0.9487294554710388,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9487294554710388,
      "rewards/reward_func/std": 0.0,
      "step": 118,
      "step_time": 23.64735871180892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 186.875,
      "completions/mean_terminated_length": 186.875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.4049151912331581,
      "epoch": 0.005511811023622047,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06125951185822487,
      "kl": 0.0013729215424973518,
      "learning_rate": 9.989069013432144e-07,
      "loss": -0.053,
      "num_tokens": 3380586.0,
      "reward": 0.0020324073266237974,
      "reward_std": 0.00812962930649519,
      "rewards/reward_func/mean": 0.0020324073266237974,
      "rewards/reward_func/std": 0.008129630237817764,
      "step": 119,
      "step_time": 19.843410819768906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 139.6875,
      "completions/mean_terminated_length": 139.6875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.23628076165914536,
      "epoch": 0.00555812876331635,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000650152622256428,
      "kl": 0.0012098945735488087,
      "learning_rate": 9.988976377952755e-07,
      "loss": 0.0001,
      "num_tokens": 3400261.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 120,
      "step_time": 14.221643339842558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 114.375,
      "completions/mean_terminated_length": 114.375,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.2281496524810791,
      "epoch": 0.005604446503010653,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007070648716762662,
      "kl": 0.001058092893799767,
      "learning_rate": 9.988883742473366e-07,
      "loss": 0.0001,
      "num_tokens": 3419659.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 121,
      "step_time": 13.323605943471193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 117.625,
      "completions/mean_terminated_length": 117.625,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.27677495777606964,
      "epoch": 0.005650764242704956,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007823980995453894,
      "kl": 0.0011881452519446611,
      "learning_rate": 9.988791106993978e-07,
      "loss": 0.0001,
      "num_tokens": 3439109.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 122,
      "step_time": 12.829030204564333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 165.8125,
      "completions/mean_terminated_length": 165.8125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.26315319538116455,
      "epoch": 0.005697081982399259,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10150697827339172,
      "kl": 0.001116703759180382,
      "learning_rate": 9.988698471514589e-07,
      "loss": 0.0457,
      "num_tokens": 3464386.0,
      "reward": 0.7361464500427246,
      "reward_std": 0.36629024147987366,
      "rewards/reward_func/mean": 0.7361464500427246,
      "rewards/reward_func/std": 0.36629024147987366,
      "step": 123,
      "step_time": 18.046367309987545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 139.3125,
      "completions/mean_terminated_length": 139.3125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.19785001128911972,
      "epoch": 0.005743399722093562,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08791724592447281,
      "kl": 0.0007540077349403873,
      "learning_rate": 9.9886058360352e-07,
      "loss": -0.0683,
      "num_tokens": 3492615.0,
      "reward": 0.47804224491119385,
      "reward_std": 0.3269173204898834,
      "rewards/reward_func/mean": 0.47804224491119385,
      "rewards/reward_func/std": 0.3269173800945282,
      "step": 124,
      "step_time": 17.178758315742016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 188.1875,
      "completions/mean_terminated_length": 188.1875,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.20732641592621803,
      "epoch": 0.005789717461787865,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05716419219970703,
      "kl": 0.000678992597386241,
      "learning_rate": 9.988513200555813e-07,
      "loss": -0.0529,
      "num_tokens": 3538618.0,
      "reward": 0.9079843759536743,
      "reward_std": 0.0359191857278347,
      "rewards/reward_func/mean": 0.9079843759536743,
      "rewards/reward_func/std": 0.035919204354286194,
      "step": 125,
      "step_time": 25.90315380319953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 163.5,
      "completions/mean_terminated_length": 163.5,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.33654970675706863,
      "epoch": 0.005836035201482168,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006308644078671932,
      "kl": 0.0012278756767045707,
      "learning_rate": 9.988420565076425e-07,
      "loss": 0.0001,
      "num_tokens": 3567314.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 126,
      "step_time": 18.876841440796852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 178.75,
      "completions/mean_terminated_length": 178.75,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.33673005551099777,
      "epoch": 0.0058823529411764705,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09591037034988403,
      "kl": 0.001308549428358674,
      "learning_rate": 9.988327929597036e-07,
      "loss": 0.0127,
      "num_tokens": 3590446.0,
      "reward": 0.28263968229293823,
      "reward_std": 0.19680501520633698,
      "rewards/reward_func/mean": 0.28263968229293823,
      "rewards/reward_func/std": 0.19680503010749817,
      "step": 127,
      "step_time": 20.69503689929843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 186.1875,
      "completions/mean_terminated_length": 186.1875,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3700679764151573,
      "epoch": 0.005928670680870773,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010768556967377663,
      "kl": 0.0012731784372590482,
      "learning_rate": 9.988235294117647e-07,
      "loss": 0.0001,
      "num_tokens": 3618833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 128,
      "step_time": 21.974650118499994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 117.0625,
      "completions/mean_terminated_length": 117.0625,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.23749050125479698,
      "epoch": 0.005974988420565076,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010812852997332811,
      "kl": 0.0011148849880555645,
      "learning_rate": 9.988142658638258e-07,
      "loss": 0.0001,
      "num_tokens": 3641634.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 129,
      "step_time": 14.077108316123486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 193.25,
      "completions/mean_terminated_length": 193.25,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.2998213544487953,
      "epoch": 0.006021306160259379,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006937870057299733,
      "kl": 0.0009626495302654803,
      "learning_rate": 9.98805002315887e-07,
      "loss": 0.0,
      "num_tokens": 3666214.0,
      "reward": 0.9487294554710388,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9487294554710388,
      "rewards/reward_func/std": 0.0,
      "step": 130,
      "step_time": 21.575381591916084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 157.25,
      "completions/mean_terminated_length": 157.25,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3175879344344139,
      "epoch": 0.006067623899953682,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014436630299314857,
      "kl": 0.001178477396024391,
      "learning_rate": 9.98795738767948e-07,
      "loss": 0.0001,
      "num_tokens": 3686746.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 131,
      "step_time": 16.170368444174528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 143.75,
      "completions/mean_terminated_length": 143.75,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.2595654986798763,
      "epoch": 0.006113941639647985,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011029423912987113,
      "kl": 0.0011006823042407632,
      "learning_rate": 9.987864752200092e-07,
      "loss": 0.0001,
      "num_tokens": 3709110.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 132,
      "step_time": 16.029532201588154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 150.25,
      "completions/mean_terminated_length": 150.25,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3049924820661545,
      "epoch": 0.006160259379342288,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002257963875308633,
      "kl": 0.0015856284298934042,
      "learning_rate": 9.987772116720703e-07,
      "loss": 0.0001,
      "num_tokens": 3738506.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 133,
      "step_time": 17.504866629838943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 212.5,
      "completions/mean_terminated_length": 212.5,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.25285639613866806,
      "epoch": 0.006206577119036591,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04643546789884567,
      "kl": 0.0009889291250146925,
      "learning_rate": 9.987679481241315e-07,
      "loss": -0.0274,
      "num_tokens": 3760530.0,
      "reward": 0.9934226274490356,
      "reward_std": 0.017972838133573532,
      "rewards/reward_func/mean": 0.9934226274490356,
      "rewards/reward_func/std": 0.01797284372150898,
      "step": 134,
      "step_time": 21.360256396234035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 183.75,
      "completions/mean_terminated_length": 183.75,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3675421178340912,
      "epoch": 0.006252894858730894,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008711877162568271,
      "kl": 0.001303886208916083,
      "learning_rate": 9.987586845761926e-07,
      "loss": 0.0001,
      "num_tokens": 3788702.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 135,
      "step_time": 20.955825198441744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 135.0,
      "completions/mean_terminated_length": 135.0,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.28805623203516006,
      "epoch": 0.006299212598425197,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011562267318367958,
      "kl": 0.0012812073400709778,
      "learning_rate": 9.987494210282537e-07,
      "loss": 0.0001,
      "num_tokens": 3809838.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 136,
      "step_time": 14.344765190035105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 168.9375,
      "completions/mean_terminated_length": 168.9375,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.17872954905033112,
      "epoch": 0.0063455303381195,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005084734293632209,
      "kl": 0.0007189370808191597,
      "learning_rate": 9.987401574803148e-07,
      "loss": 0.0,
      "num_tokens": 3831341.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 137,
      "step_time": 21.364930722862482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 177.5625,
      "completions/mean_terminated_length": 177.5625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.24735918268561363,
      "epoch": 0.006391848077813803,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010594031773507595,
      "kl": 0.001022157768602483,
      "learning_rate": 9.987308939323762e-07,
      "loss": 0.0001,
      "num_tokens": 3852998.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 138,
      "step_time": 19.29247647151351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 235.3125,
      "completions/mean_terminated_length": 235.3125,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "entropy": 0.24933470785617828,
      "epoch": 0.0064381658175081055,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05908169969916344,
      "kl": 0.0011135565146105364,
      "learning_rate": 9.987216303844373e-07,
      "loss": 0.0113,
      "num_tokens": 3876779.0,
      "reward": 0.9620516300201416,
      "reward_std": 0.08158649504184723,
      "rewards/reward_func/mean": 0.9620516300201416,
      "rewards/reward_func/std": 0.08158650249242783,
      "step": 139,
      "step_time": 23.81665090844035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 208.625,
      "completions/mean_terminated_length": 208.625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3989019989967346,
      "epoch": 0.006484483557202408,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05581681802868843,
      "kl": 0.0011897674994543195,
      "learning_rate": 9.987123668364982e-07,
      "loss": -0.0434,
      "num_tokens": 3904037.0,
      "reward": 0.012167919427156448,
      "reward_std": 0.048671673983335495,
      "rewards/reward_func/mean": 0.012167919427156448,
      "rewards/reward_func/std": 0.04867167770862579,
      "step": 140,
      "step_time": 22.690000787377357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 330.0,
      "completions/max_terminated_length": 330.0,
      "completions/mean_length": 194.6875,
      "completions/mean_terminated_length": 194.6875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.3049769699573517,
      "epoch": 0.006530801296896711,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08445534855127335,
      "kl": 0.0035717372375074774,
      "learning_rate": 9.987031032885593e-07,
      "loss": -0.1389,
      "num_tokens": 3928576.0,
      "reward": 0.1740274727344513,
      "reward_std": 0.1714896857738495,
      "rewards/reward_func/mean": 0.1740274727344513,
      "rewards/reward_func/std": 0.1714896857738495,
      "step": 141,
      "step_time": 28.506950981914997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 150.875,
      "completions/mean_terminated_length": 150.875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.32121188193559647,
      "epoch": 0.006577119036591014,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000673571543302387,
      "kl": 0.0014779901539441198,
      "learning_rate": 9.986938397406207e-07,
      "loss": 0.0001,
      "num_tokens": 3956654.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 142,
      "step_time": 18.16906550899148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 193.9375,
      "completions/mean_terminated_length": 193.9375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.39169544726610184,
      "epoch": 0.006623436776285317,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07600442320108414,
      "kl": 0.0012212564470246434,
      "learning_rate": 9.986845761926818e-07,
      "loss": -0.0145,
      "num_tokens": 3981229.0,
      "reward": 0.03668078035116196,
      "reward_std": 0.06561657041311264,
      "rewards/reward_func/mean": 0.03668078035116196,
      "rewards/reward_func/std": 0.06561657786369324,
      "step": 143,
      "step_time": 19.928812380880117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 334.0,
      "completions/max_terminated_length": 334.0,
      "completions/mean_length": 259.0,
      "completions/mean_terminated_length": 259.0,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "entropy": 0.29782533645629883,
      "epoch": 0.00666975451597962,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04033688083291054,
      "kl": 0.0008177869021892548,
      "learning_rate": 9.98675312644743e-07,
      "loss": 0.0433,
      "num_tokens": 4006077.0,
      "reward": 0.24205882847309113,
      "reward_std": 0.42336198687553406,
      "rewards/reward_func/mean": 0.24205882847309113,
      "rewards/reward_func/std": 0.42336201667785645,
      "step": 144,
      "step_time": 28.188720546662807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 160.4375,
      "completions/mean_terminated_length": 160.4375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3879060745239258,
      "epoch": 0.006716072255673923,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001458464190363884,
      "kl": 0.001835305680288002,
      "learning_rate": 9.98666049096804e-07,
      "loss": 0.0001,
      "num_tokens": 4035540.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 145,
      "step_time": 20.65491282194853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 137.4375,
      "completions/mean_terminated_length": 137.4375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.23363490775227547,
      "epoch": 0.006762389995368226,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007848707609809935,
      "kl": 0.0009721961687318981,
      "learning_rate": 9.986567855488652e-07,
      "loss": 0.0,
      "num_tokens": 4055195.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 146,
      "step_time": 14.133691564202309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 148.75,
      "completions/mean_terminated_length": 148.75,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3750767633318901,
      "epoch": 0.006808707735062529,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001035345601849258,
      "kl": 0.0017306151567026973,
      "learning_rate": 9.986475220009263e-07,
      "loss": 0.0001,
      "num_tokens": 4108327.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 147,
      "step_time": 24.681630488485098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 123.0,
      "completions/mean_terminated_length": 123.0,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.21959315985441208,
      "epoch": 0.006855025474756832,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011265820357948542,
      "kl": 0.0010525864054216072,
      "learning_rate": 9.986382584529874e-07,
      "loss": 0.0001,
      "num_tokens": 4127991.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 148,
      "step_time": 13.904216725379229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 132.25,
      "completions/mean_terminated_length": 132.25,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.37593110650777817,
      "epoch": 0.006901343214451135,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001913378364406526,
      "kl": 0.0012810016341973096,
      "learning_rate": 9.986289949050486e-07,
      "loss": 0.0001,
      "num_tokens": 4153067.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 149,
      "step_time": 15.85357840359211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 180.0,
      "completions/mean_terminated_length": 180.0,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.42250946909189224,
      "epoch": 0.006947660954145438,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005991277866996825,
      "kl": 0.0015141093172132969,
      "learning_rate": 9.986197313571097e-07,
      "loss": 0.0001,
      "num_tokens": 4197211.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 150,
      "step_time": 23.63872228562832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 169.8125,
      "completions/mean_terminated_length": 169.8125,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.2225952185690403,
      "epoch": 0.0069939786938397405,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005857797805219889,
      "kl": 0.0009670153085608035,
      "learning_rate": 9.986104678091708e-07,
      "loss": 0.0,
      "num_tokens": 4217784.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 151,
      "step_time": 18.001822039484978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 216.125,
      "completions/mean_terminated_length": 216.125,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.35866475105285645,
      "epoch": 0.0070402964335340434,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07115766406059265,
      "kl": 0.0011862863320857286,
      "learning_rate": 9.98601204261232e-07,
      "loss": 0.0495,
      "num_tokens": 4239146.0,
      "reward": 0.9892370700836182,
      "reward_std": 0.04305167496204376,
      "rewards/reward_func/mean": 0.9892370700836182,
      "rewards/reward_func/std": 0.04305167496204376,
      "step": 152,
      "step_time": 22.355235513299704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 162.125,
      "completions/mean_terminated_length": 162.125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.26412268728017807,
      "epoch": 0.007086614173228346,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011610282817855477,
      "kl": 0.001230985508300364,
      "learning_rate": 9.98591940713293e-07,
      "loss": 0.0001,
      "num_tokens": 4266940.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 153,
      "step_time": 18.30440727248788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 152.75,
      "completions/mean_terminated_length": 152.75,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.39102543890476227,
      "epoch": 0.007132931912922649,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010339897125959396,
      "kl": 0.0018844558508135378,
      "learning_rate": 9.985826771653542e-07,
      "loss": 0.0001,
      "num_tokens": 4319096.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 154,
      "step_time": 23.435311898589134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 129.0,
      "completions/max_terminated_length": 129.0,
      "completions/mean_length": 121.375,
      "completions/mean_terminated_length": 121.375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.29395852982997894,
      "epoch": 0.007179249652616952,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007433706778101623,
      "kl": 0.0009080634627025574,
      "learning_rate": 9.985734136174155e-07,
      "loss": 0.0,
      "num_tokens": 4341438.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 155,
      "step_time": 13.602233476936817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 140.5625,
      "completions/mean_terminated_length": 140.5625,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.19000105187296867,
      "epoch": 0.007225567392311255,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004755662230309099,
      "kl": 0.0007904800295364112,
      "learning_rate": 9.985641500694766e-07,
      "loss": 0.0,
      "num_tokens": 4364679.0,
      "reward": 0.25042009353637695,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.25042009353637695,
      "rewards/reward_func/std": 0.0,
      "step": 156,
      "step_time": 14.93125580623746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 207.75,
      "completions/mean_terminated_length": 207.75,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.22341416031122208,
      "epoch": 0.007271885132005558,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05051978677511215,
      "kl": 0.0008343583176610991,
      "learning_rate": 9.985548865215378e-07,
      "loss": 0.0193,
      "num_tokens": 4402195.0,
      "reward": 0.596699595451355,
      "reward_std": 0.4261772036552429,
      "rewards/reward_func/mean": 0.596699595451355,
      "rewards/reward_func/std": 0.4261772334575653,
      "step": 157,
      "step_time": 24.41818241775036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 392.0,
      "completions/max_terminated_length": 392.0,
      "completions/mean_length": 266.1875,
      "completions/mean_terminated_length": 266.1875,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.36868835985660553,
      "epoch": 0.007318202871699861,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05326659977436066,
      "kl": 0.0010871543636312708,
      "learning_rate": 9.985456229735989e-07,
      "loss": -0.2973,
      "num_tokens": 4443286.0,
      "reward": 0.3007088899612427,
      "reward_std": 0.3702571392059326,
      "rewards/reward_func/mean": 0.3007088899612427,
      "rewards/reward_func/std": 0.3702571392059326,
      "step": 158,
      "step_time": 36.127769846469164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 167.1875,
      "completions/mean_terminated_length": 167.1875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.19336718693375587,
      "epoch": 0.007364520611394164,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000494747597258538,
      "kl": 0.000735267938580364,
      "learning_rate": 9.9853635942566e-07,
      "loss": 0.0,
      "num_tokens": 4476393.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 159,
      "step_time": 20.922476079314947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 156.8125,
      "completions/mean_terminated_length": 156.8125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.1699804738163948,
      "epoch": 0.007410838351088467,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018062412273138762,
      "kl": 0.001229065004736185,
      "learning_rate": 9.985270958777211e-07,
      "loss": 0.0001,
      "num_tokens": 4510406.0,
      "reward": 0.9000876545906067,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9000876545906067,
      "rewards/reward_func/std": 0.0,
      "step": 160,
      "step_time": 18.807097870856524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 141.375,
      "completions/mean_terminated_length": 141.375,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3330295979976654,
      "epoch": 0.00745715609078277,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005328988772816956,
      "kl": 0.0011647845094557852,
      "learning_rate": 9.985178323297823e-07,
      "loss": 0.0001,
      "num_tokens": 4546508.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 161,
      "step_time": 18.652416814118624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 128.9375,
      "completions/mean_terminated_length": 128.9375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3115380331873894,
      "epoch": 0.007503473830477073,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018411468481644988,
      "kl": 0.0017008328577503562,
      "learning_rate": 9.985085687818434e-07,
      "loss": 0.0001,
      "num_tokens": 4575883.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 162,
      "step_time": 16.12642402946949
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 163.25,
      "completions/mean_terminated_length": 163.25,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.18948416784405708,
      "epoch": 0.0075497915701713755,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00046944760833866894,
      "kl": 0.000663085505948402,
      "learning_rate": 9.984993052339045e-07,
      "loss": 0.0,
      "num_tokens": 4618527.0,
      "reward": 0.8574039340019226,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8574039340019226,
      "rewards/reward_func/std": 0.0,
      "step": 163,
      "step_time": 22.54652241244912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 165.625,
      "completions/mean_terminated_length": 165.625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.2251650206744671,
      "epoch": 0.0075961093098656784,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008289703982882202,
      "kl": 0.0009271571907447651,
      "learning_rate": 9.984900416859656e-07,
      "loss": 0.0,
      "num_tokens": 4640569.0,
      "reward": 0.894839346408844,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.894839346408844,
      "rewards/reward_func/std": 0.0,
      "step": 164,
      "step_time": 18.0803255289793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 130.5625,
      "completions/mean_terminated_length": 130.5625,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.3675830289721489,
      "epoch": 0.007642427049559981,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009684573160484433,
      "kl": 0.0015152069390751421,
      "learning_rate": 9.984807781380268e-07,
      "loss": 0.0001,
      "num_tokens": 4663058.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 165,
      "step_time": 14.896864034235477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 158.0625,
      "completions/mean_terminated_length": 158.0625,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3842160478234291,
      "epoch": 0.007688744789254284,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019603169057518244,
      "kl": 0.0025681756669655442,
      "learning_rate": 9.984715145900879e-07,
      "loss": 0.0001,
      "num_tokens": 4693891.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 166,
      "step_time": 18.503708496689796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 138.6875,
      "completions/mean_terminated_length": 138.6875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.29680949449539185,
      "epoch": 0.007735062528948587,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007003445643931627,
      "kl": 0.0013395841815508902,
      "learning_rate": 9.98462251042149e-07,
      "loss": 0.0001,
      "num_tokens": 4716878.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 167,
      "step_time": 15.016962468624115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 178.3125,
      "completions/mean_terminated_length": 178.3125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.2283756174147129,
      "epoch": 0.00778138026864289,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06226864829659462,
      "kl": 0.0007998852815944701,
      "learning_rate": 9.984529874942104e-07,
      "loss": 0.001,
      "num_tokens": 4745587.0,
      "reward": 0.926367998123169,
      "reward_std": 0.01877889409661293,
      "rewards/reward_func/mean": 0.926367998123169,
      "rewards/reward_func/std": 0.018778905272483826,
      "step": 168,
      "step_time": 19.92407266050577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 377.0,
      "completions/max_terminated_length": 377.0,
      "completions/mean_length": 236.5625,
      "completions/mean_terminated_length": 236.5625,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.3746219798922539,
      "epoch": 0.007827698008337193,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05858612433075905,
      "kl": 0.0010736614640336484,
      "learning_rate": 9.984437239462715e-07,
      "loss": -0.1012,
      "num_tokens": 4772556.0,
      "reward": 0.3508151173591614,
      "reward_std": 0.46775349974632263,
      "rewards/reward_func/mean": 0.3508151173591614,
      "rewards/reward_func/std": 0.46775349974632263,
      "step": 169,
      "step_time": 31.445678532123566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 161.8125,
      "completions/mean_terminated_length": 161.8125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.12431573867797852,
      "epoch": 0.007874015748031496,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10413499921560287,
      "kl": 0.0013005670480197296,
      "learning_rate": 9.984344603983326e-07,
      "loss": -0.0004,
      "num_tokens": 4795081.0,
      "reward": 0.4928065836429596,
      "reward_std": 0.19798803329467773,
      "rewards/reward_func/mean": 0.4928065836429596,
      "rewards/reward_func/std": 0.19798806309700012,
      "step": 170,
      "step_time": 17.126375176012516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 152.4375,
      "completions/mean_terminated_length": 152.4375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.2561582215130329,
      "epoch": 0.007920333487725799,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08171358704566956,
      "kl": 0.001231180940521881,
      "learning_rate": 9.984251968503935e-07,
      "loss": -0.026,
      "num_tokens": 4816800.0,
      "reward": 0.9293943643569946,
      "reward_std": 0.035030219703912735,
      "rewards/reward_func/mean": 0.9293943643569946,
      "rewards/reward_func/std": 0.03503022342920303,
      "step": 171,
      "step_time": 17.46403419226408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 201.0,
      "completions/mean_terminated_length": 201.0,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.4090711176395416,
      "epoch": 0.007966651227420102,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007271178183145821,
      "kl": 0.0014456688368227333,
      "learning_rate": 9.984159333024549e-07,
      "loss": 0.0001,
      "num_tokens": 4845136.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 172,
      "step_time": 22.596364434808493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 149.0,
      "completions/mean_terminated_length": 149.0,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.2239462547004223,
      "epoch": 0.008012968967114405,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07950145751237869,
      "kl": 0.0008892695477697998,
      "learning_rate": 9.98406669754516e-07,
      "loss": -0.0373,
      "num_tokens": 4868096.0,
      "reward": 0.9077697992324829,
      "reward_std": 0.09525498002767563,
      "rewards/reward_func/mean": 0.9077697992324829,
      "rewards/reward_func/std": 0.09525497257709503,
      "step": 173,
      "step_time": 16.582812402397394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 176.5,
      "completions/mean_terminated_length": 176.5,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.2988496795296669,
      "epoch": 0.008059286706808708,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00046105749788694084,
      "kl": 0.0010157113574678078,
      "learning_rate": 9.98397406206577e-07,
      "loss": 0.0001,
      "num_tokens": 4890696.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 174,
      "step_time": 18.554279018193483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 170.4375,
      "completions/mean_terminated_length": 170.4375,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.20798297598958015,
      "epoch": 0.00810560444650301,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003556886513251811,
      "kl": 0.000614775184658356,
      "learning_rate": 9.983881426586382e-07,
      "loss": 0.0,
      "num_tokens": 4927759.0,
      "reward": 0.9167169332504272,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9167169332504272,
      "rewards/reward_func/std": 0.0,
      "step": 175,
      "step_time": 21.02095464617014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 289.0,
      "completions/max_terminated_length": 289.0,
      "completions/mean_length": 265.75,
      "completions/mean_terminated_length": 265.75,
      "completions/min_length": 245.0,
      "completions/min_terminated_length": 245.0,
      "entropy": 0.22096781060099602,
      "epoch": 0.008151922186197313,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.052729543298482895,
      "kl": 0.0010106703994097188,
      "learning_rate": 9.983788791106994e-07,
      "loss": -0.0108,
      "num_tokens": 4967035.0,
      "reward": 0.9709692001342773,
      "reward_std": 0.014403305016458035,
      "rewards/reward_func/mean": 0.9709692001342773,
      "rewards/reward_func/std": 0.014403297565877438,
      "step": 176,
      "step_time": 28.388985190540552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 170.1875,
      "completions/mean_terminated_length": 170.1875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.17096561938524246,
      "epoch": 0.008198239925891616,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004381695471238345,
      "kl": 0.000727768667275086,
      "learning_rate": 9.983696155627605e-07,
      "loss": 0.0,
      "num_tokens": 4992398.0,
      "reward": 0.9574533700942993,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9574533700942993,
      "rewards/reward_func/std": 0.0,
      "step": 177,
      "step_time": 18.1888774856925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 130.6875,
      "completions/mean_terminated_length": 130.6875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.18110191822052002,
      "epoch": 0.00824455766558592,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11362199485301971,
      "kl": 0.0009543791529722512,
      "learning_rate": 9.983603520148216e-07,
      "loss": -0.0196,
      "num_tokens": 5029049.0,
      "reward": 0.29507988691329956,
      "reward_std": 0.032220497727394104,
      "rewards/reward_func/mean": 0.29507988691329956,
      "rewards/reward_func/std": 0.032220497727394104,
      "step": 178,
      "step_time": 18.264590088278055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 112.0625,
      "completions/mean_terminated_length": 112.0625,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.267382450401783,
      "epoch": 0.008290875405280222,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001289795502088964,
      "kl": 0.0013163190451450646,
      "learning_rate": 9.983510884668827e-07,
      "loss": 0.0001,
      "num_tokens": 5049274.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 179,
      "step_time": 13.03090962767601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 141.4375,
      "completions/mean_terminated_length": 141.4375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.30875954031944275,
      "epoch": 0.008337193144974525,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009180946508422494,
      "kl": 0.00131464286823757,
      "learning_rate": 9.983418249189439e-07,
      "loss": 0.0001,
      "num_tokens": 5069905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 180,
      "step_time": 14.655602425336838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 128.0625,
      "completions/mean_terminated_length": 128.0625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.3586525395512581,
      "epoch": 0.008383510884668828,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005821700324304402,
      "kl": 0.0011948540050070733,
      "learning_rate": 9.98332561371005e-07,
      "loss": 0.0001,
      "num_tokens": 5097506.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 181,
      "step_time": 15.478844940662384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 121.375,
      "completions/mean_terminated_length": 121.375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.33501479029655457,
      "epoch": 0.008429828624363131,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009040228906087577,
      "kl": 0.0010803402110468596,
      "learning_rate": 9.983232978230663e-07,
      "loss": 0.0001,
      "num_tokens": 5117528.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 182,
      "step_time": 14.24662160873413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 172.5,
      "completions/mean_terminated_length": 172.5,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.389257088303566,
      "epoch": 0.008476146364057434,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005249987007118762,
      "kl": 0.0011284275096841156,
      "learning_rate": 9.983140342751272e-07,
      "loss": 0.0001,
      "num_tokens": 5152240.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 183,
      "step_time": 21.00098342075944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 172.3125,
      "completions/mean_terminated_length": 172.3125,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.35628943890333176,
      "epoch": 0.008522464103751737,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008661292959004641,
      "kl": 0.001361237169476226,
      "learning_rate": 9.983047707271884e-07,
      "loss": 0.0001,
      "num_tokens": 5177477.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 184,
      "step_time": 19.700187604874372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 200.0,
      "completions/mean_terminated_length": 200.0,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.4047982096672058,
      "epoch": 0.00856878184344604,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19900654256343842,
      "kl": 0.0020328431273810565,
      "learning_rate": 9.982955071792497e-07,
      "loss": -0.0895,
      "num_tokens": 5203637.0,
      "reward": 0.08689714223146439,
      "reward_std": 0.11963946372270584,
      "rewards/reward_func/mean": 0.08689714223146439,
      "rewards/reward_func/std": 0.11963947117328644,
      "step": 185,
      "step_time": 24.889085162431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 125.0,
      "completions/max_terminated_length": 125.0,
      "completions/mean_length": 111.4375,
      "completions/mean_terminated_length": 111.4375,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.27358129620552063,
      "epoch": 0.008615099583140343,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012420903658494353,
      "kl": 0.0012404267909005284,
      "learning_rate": 9.982862436313108e-07,
      "loss": 0.0001,
      "num_tokens": 5226348.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 186,
      "step_time": 12.944620177149773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 154.4375,
      "completions/mean_terminated_length": 154.4375,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.17678061872720718,
      "epoch": 0.008661417322834646,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007334018009714782,
      "kl": 0.0007218590471893549,
      "learning_rate": 9.98276980083372e-07,
      "loss": 0.0,
      "num_tokens": 5252995.0,
      "reward": 0.9021315574645996,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9021315574645996,
      "rewards/reward_func/std": 0.0,
      "step": 187,
      "step_time": 17.63252827897668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 125.6875,
      "completions/mean_terminated_length": 125.6875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3704119995236397,
      "epoch": 0.008707735062528948,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001009262283332646,
      "kl": 0.001398083579260856,
      "learning_rate": 9.98267716535433e-07,
      "loss": 0.0001,
      "num_tokens": 5275918.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 188,
      "step_time": 15.064791101962328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 212.875,
      "completions/mean_terminated_length": 212.875,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.22819731384515762,
      "epoch": 0.008754052802223251,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06821474432945251,
      "kl": 0.0008659957093186677,
      "learning_rate": 9.982584529874942e-07,
      "loss": 0.0372,
      "num_tokens": 5301100.0,
      "reward": 0.6976216435432434,
      "reward_std": 0.1710186004638672,
      "rewards/reward_func/mean": 0.6976216435432434,
      "rewards/reward_func/std": 0.171018585562706,
      "step": 189,
      "step_time": 23.242990478873253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 133.5625,
      "completions/mean_terminated_length": 133.5625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.28962938487529755,
      "epoch": 0.008800370541917554,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0040439129807055,
      "kl": 0.002571428776718676,
      "learning_rate": 9.982491894395553e-07,
      "loss": 0.0001,
      "num_tokens": 5321557.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 190,
      "step_time": 13.96716882660985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 128.3125,
      "completions/mean_terminated_length": 128.3125,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.29960859566926956,
      "epoch": 0.008846688281611857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010171140311285853,
      "kl": 0.0013723817246500403,
      "learning_rate": 9.982399258916164e-07,
      "loss": 0.0001,
      "num_tokens": 5342570.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 191,
      "step_time": 15.181181944906712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 166.875,
      "completions/mean_terminated_length": 166.875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.3717504292726517,
      "epoch": 0.00889300602130616,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006592174177058041,
      "kl": 0.0011195582919754088,
      "learning_rate": 9.982306623436776e-07,
      "loss": 0.0001,
      "num_tokens": 5373624.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 192,
      "step_time": 20.758096884936094
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 138.375,
      "completions/mean_terminated_length": 138.375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.23235268890857697,
      "epoch": 0.008939323761000463,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09054431319236755,
      "kl": 0.0010597734071779996,
      "learning_rate": 9.982213987957387e-07,
      "loss": -0.037,
      "num_tokens": 5393982.0,
      "reward": 0.9185318946838379,
      "reward_std": 0.021724820137023926,
      "rewards/reward_func/mean": 0.9185318946838379,
      "rewards/reward_func/std": 0.021724820137023926,
      "step": 193,
      "step_time": 16.371366318315268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 217.6875,
      "completions/mean_terminated_length": 217.6875,
      "completions/min_length": 203.0,
      "completions/min_terminated_length": 203.0,
      "entropy": 0.1963135525584221,
      "epoch": 0.008985641500694766,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013173865154385567,
      "kl": 0.00090785016072914,
      "learning_rate": 9.982121352477998e-07,
      "loss": 0.0,
      "num_tokens": 5430777.0,
      "reward": 0.7428231239318848,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7428231239318848,
      "rewards/reward_func/std": 0.0,
      "step": 194,
      "step_time": 23.588253416121006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 174.6875,
      "completions/mean_terminated_length": 174.6875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.22125248238444328,
      "epoch": 0.009031959240389069,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005526018794625998,
      "kl": 0.0008960505801951513,
      "learning_rate": 9.98202871699861e-07,
      "loss": 0.0,
      "num_tokens": 5461652.0,
      "reward": 0.5712090730667114,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5712090730667114,
      "rewards/reward_func/std": 0.0,
      "step": 195,
      "step_time": 19.9322307407856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 161.8125,
      "completions/mean_terminated_length": 161.8125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3360860273241997,
      "epoch": 0.009078276980083372,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012191702844575047,
      "kl": 0.0012908496137242764,
      "learning_rate": 9.98193608151922e-07,
      "loss": 0.0001,
      "num_tokens": 5483729.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 196,
      "step_time": 18.82015247270465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 187.6875,
      "completions/mean_terminated_length": 187.6875,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.1839112527668476,
      "epoch": 0.009124594719777675,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014125570887699723,
      "kl": 0.0009949714440153912,
      "learning_rate": 9.981843446039832e-07,
      "loss": 0.0001,
      "num_tokens": 5505436.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 197,
      "step_time": 18.444256361573935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 156.5625,
      "completions/mean_terminated_length": 156.5625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3111222982406616,
      "epoch": 0.009170912459471978,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001630751183256507,
      "kl": 0.0015696244081482291,
      "learning_rate": 9.981750810560445e-07,
      "loss": 0.0001,
      "num_tokens": 5528837.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 198,
      "step_time": 16.388581547886133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 154.75,
      "completions/mean_terminated_length": 154.75,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.14439477026462555,
      "epoch": 0.00921723019916628,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11620976775884628,
      "kl": 0.0008143085142364725,
      "learning_rate": 9.981658175081056e-07,
      "loss": 0.0301,
      "num_tokens": 5559649.0,
      "reward": 0.9239631295204163,
      "reward_std": 0.029681755229830742,
      "rewards/reward_func/mean": 0.9239631295204163,
      "rewards/reward_func/std": 0.02968175709247589,
      "step": 199,
      "step_time": 17.773583106696606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 165.625,
      "completions/mean_terminated_length": 165.625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.16612479090690613,
      "epoch": 0.009263547938860583,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07665696740150452,
      "kl": 0.00095115571457427,
      "learning_rate": 9.981565539601668e-07,
      "loss": 0.0186,
      "num_tokens": 5583899.0,
      "reward": 0.9961940050125122,
      "reward_std": 0.015223884023725986,
      "rewards/reward_func/mean": 0.9961940050125122,
      "rewards/reward_func/std": 0.015223890542984009,
      "step": 200,
      "step_time": 18.268201805651188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 117.0,
      "completions/mean_terminated_length": 117.0,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.31366194784641266,
      "epoch": 0.009309865678554886,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011350114364176989,
      "kl": 0.001548691448988393,
      "learning_rate": 9.981472904122277e-07,
      "loss": 0.0001,
      "num_tokens": 5604971.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 201,
      "step_time": 13.308431796729565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 181.25,
      "completions/mean_terminated_length": 181.25,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.19328206777572632,
      "epoch": 0.00935618341824919,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00047764976625330746,
      "kl": 0.0007238638063427061,
      "learning_rate": 9.98138026864289e-07,
      "loss": 0.0,
      "num_tokens": 5657343.0,
      "reward": 0.7860752940177917,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7860752940177917,
      "rewards/reward_func/std": 0.0,
      "step": 202,
      "step_time": 25.874954532831907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 365.0,
      "completions/max_terminated_length": 365.0,
      "completions/mean_length": 315.0625,
      "completions/mean_terminated_length": 315.0625,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "entropy": 0.2514698840677738,
      "epoch": 0.009402501157943492,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04165821149945259,
      "kl": 0.0008549775666324422,
      "learning_rate": 9.981287633163501e-07,
      "loss": -0.1428,
      "num_tokens": 5696496.0,
      "reward": 0.6085189580917358,
      "reward_std": 0.48871079087257385,
      "rewards/reward_func/mean": 0.6085189580917358,
      "rewards/reward_func/std": 0.48871076107025146,
      "step": 203,
      "step_time": 33.79615079984069
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 129.625,
      "completions/mean_terminated_length": 129.625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.27936455607414246,
      "epoch": 0.009448818897637795,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006951396935619414,
      "kl": 0.0011286622902844101,
      "learning_rate": 9.981194997684113e-07,
      "loss": 0.0001,
      "num_tokens": 5719594.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 204,
      "step_time": 14.518286500126123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 150.0,
      "completions/mean_terminated_length": 150.0,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.3197743594646454,
      "epoch": 0.009495136637332098,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010948445415124297,
      "kl": 0.001026401572744362,
      "learning_rate": 9.981102362204724e-07,
      "loss": 0.0001,
      "num_tokens": 5743658.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 205,
      "step_time": 16.007482074201107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 165.125,
      "completions/mean_terminated_length": 165.125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.2688465192914009,
      "epoch": 0.009541454377026401,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06209837272763252,
      "kl": 0.000768767815316096,
      "learning_rate": 9.981009726725335e-07,
      "loss": -0.0073,
      "num_tokens": 5769948.0,
      "reward": 0.9185318946838379,
      "reward_std": 0.021724820137023926,
      "rewards/reward_func/mean": 0.9185318946838379,
      "rewards/reward_func/std": 0.021724820137023926,
      "step": 206,
      "step_time": 18.914034850895405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 185.0625,
      "completions/mean_terminated_length": 185.0625,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.20774800330400467,
      "epoch": 0.009587772116720704,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007211709744296968,
      "kl": 0.0009327216830570251,
      "learning_rate": 9.980917091245946e-07,
      "loss": 0.0,
      "num_tokens": 5798461.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 207,
      "step_time": 19.299127969890833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 151.0625,
      "completions/mean_terminated_length": 151.0625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.25372298434376717,
      "epoch": 0.009634089856415007,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00082414411008358,
      "kl": 0.0011023855477105826,
      "learning_rate": 9.980824455766558e-07,
      "loss": 0.0001,
      "num_tokens": 5819390.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 208,
      "step_time": 15.844503808766603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 129.0,
      "completions/max_terminated_length": 129.0,
      "completions/mean_length": 111.6875,
      "completions/mean_terminated_length": 111.6875,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "entropy": 0.28323476761579514,
      "epoch": 0.00968040759610931,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009210107382386923,
      "kl": 0.0013738409033976495,
      "learning_rate": 9.98073182028717e-07,
      "loss": 0.0001,
      "num_tokens": 5839865.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 209,
      "step_time": 12.719179343432188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 199.4375,
      "completions/mean_terminated_length": 199.4375,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.1765754520893097,
      "epoch": 0.009726725335803613,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06527356803417206,
      "kl": 0.0009415536333108321,
      "learning_rate": 9.98063918480778e-07,
      "loss": -0.0021,
      "num_tokens": 5863616.0,
      "reward": 0.9642957448959351,
      "reward_std": 0.028563430532813072,
      "rewards/reward_func/mean": 0.9642957448959351,
      "rewards/reward_func/std": 0.028563441708683968,
      "step": 210,
      "step_time": 20.758552063256502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 180.5625,
      "completions/mean_terminated_length": 180.5625,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.29664914309978485,
      "epoch": 0.009773043075497916,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0858040302991867,
      "kl": 0.0014906687720213085,
      "learning_rate": 9.980546549328391e-07,
      "loss": 0.0008,
      "num_tokens": 5891945.0,
      "reward": 0.14649486541748047,
      "reward_std": 0.1612107902765274,
      "rewards/reward_func/mean": 0.14649486541748047,
      "rewards/reward_func/std": 0.1612107902765274,
      "step": 211,
      "step_time": 20.227817099541426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 151.625,
      "completions/mean_terminated_length": 151.625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.13642069324851036,
      "epoch": 0.009819360815192218,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004876737657468766,
      "kl": 0.0006350057010422461,
      "learning_rate": 9.980453913849005e-07,
      "loss": 0.0,
      "num_tokens": 5932915.0,
      "reward": 0.19180183112621307,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.19180183112621307,
      "rewards/reward_func/std": 0.0,
      "step": 212,
      "step_time": 21.081323496997356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 125.0,
      "completions/mean_terminated_length": 125.0,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.1973336823284626,
      "epoch": 0.009865678554886521,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011800038628280163,
      "kl": 0.0010994782787747681,
      "learning_rate": 9.980361278369616e-07,
      "loss": 0.0001,
      "num_tokens": 5952339.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 213,
      "step_time": 15.241963744163513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 203.0,
      "completions/mean_terminated_length": 203.0,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.22050614282488823,
      "epoch": 0.009911996294580824,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08256851136684418,
      "kl": 0.0009374968358315527,
      "learning_rate": 9.980268642890225e-07,
      "loss": 0.0,
      "num_tokens": 5991779.0,
      "reward": 0.42199599742889404,
      "reward_std": 0.015191474929451942,
      "rewards/reward_func/mean": 0.42199599742889404,
      "rewards/reward_func/std": 0.015191479586064816,
      "step": 214,
      "step_time": 24.130947835743427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 188.1875,
      "completions/mean_terminated_length": 188.1875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.33391566574573517,
      "epoch": 0.009958314034275127,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005107586039230227,
      "kl": 0.0011112367501482368,
      "learning_rate": 9.980176007410839e-07,
      "loss": 0.0001,
      "num_tokens": 6025622.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 215,
      "step_time": 24.09608830884099
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 126.4375,
      "completions/mean_terminated_length": 126.4375,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.23009155690670013,
      "epoch": 0.01000463177396943,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006373866344802082,
      "kl": 0.0009257519559469074,
      "learning_rate": 9.98008337193145e-07,
      "loss": 0.0,
      "num_tokens": 6045117.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 216,
      "step_time": 14.371771406382322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 154.875,
      "completions/mean_terminated_length": 154.875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.23760268464684486,
      "epoch": 0.010050949513663733,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010695838136598468,
      "kl": 0.0011337489413563162,
      "learning_rate": 9.979990736452061e-07,
      "loss": 0.0001,
      "num_tokens": 6066523.0,
      "reward": 0.7446697354316711,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7446697354316711,
      "rewards/reward_func/std": 0.0,
      "step": 217,
      "step_time": 17.035848531872034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 183.375,
      "completions/mean_terminated_length": 183.375,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3174886256456375,
      "epoch": 0.010097267253358036,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005763350054621696,
      "kl": 0.000995459602563642,
      "learning_rate": 9.979898100972672e-07,
      "loss": 0.0,
      "num_tokens": 6104593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 218,
      "step_time": 22.565702576190233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 216.625,
      "completions/mean_terminated_length": 216.625,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.37874598801136017,
      "epoch": 0.010143584993052339,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06810466200113297,
      "kl": 0.0009915903065120801,
      "learning_rate": 9.979805465493284e-07,
      "loss": 0.0573,
      "num_tokens": 6140043.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 219,
      "step_time": 26.506840966641903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 312.0,
      "completions/max_terminated_length": 312.0,
      "completions/mean_length": 221.75,
      "completions/mean_terminated_length": 221.75,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.34261541068553925,
      "epoch": 0.010189902732746642,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.060994748026132584,
      "kl": 0.0011211848468519747,
      "learning_rate": 9.979712830013895e-07,
      "loss": 0.0147,
      "num_tokens": 6163463.0,
      "reward": 0.3214479684829712,
      "reward_std": 0.3117648959159851,
      "rewards/reward_func/mean": 0.3214479684829712,
      "rewards/reward_func/std": 0.3117648959159851,
      "step": 220,
      "step_time": 26.26476990059018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 142.6875,
      "completions/mean_terminated_length": 142.6875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.339463971555233,
      "epoch": 0.010236220472440945,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001397659070789814,
      "kl": 0.0020507503650151193,
      "learning_rate": 9.979620194534506e-07,
      "loss": 0.0001,
      "num_tokens": 6188338.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 221,
      "step_time": 16.402330487966537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 161.125,
      "completions/mean_terminated_length": 161.125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.20277191698551178,
      "epoch": 0.010282538212135248,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009595077717676759,
      "kl": 0.001125899056205526,
      "learning_rate": 9.979527559055117e-07,
      "loss": 0.0001,
      "num_tokens": 6223380.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 222,
      "step_time": 21.72225385531783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 176.3125,
      "completions/mean_terminated_length": 176.3125,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.43114665895700455,
      "epoch": 0.01032885595182955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001142045366577804,
      "kl": 0.0017447507416363806,
      "learning_rate": 9.979434923575729e-07,
      "loss": 0.0001,
      "num_tokens": 6251513.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 223,
      "step_time": 19.1215389855206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 182.625,
      "completions/mean_terminated_length": 182.625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3638230934739113,
      "epoch": 0.010375173691523853,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011755140731111169,
      "kl": 0.0018604571232572198,
      "learning_rate": 9.97934228809634e-07,
      "loss": 0.0001,
      "num_tokens": 6278355.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 224,
      "step_time": 19.442218646407127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 304.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 248.3125,
      "completions/mean_terminated_length": 248.3125,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "entropy": 0.22825674712657928,
      "epoch": 0.010421491431218156,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0447196401655674,
      "kl": 0.0009762654372025281,
      "learning_rate": 9.979249652616953e-07,
      "loss": -0.0198,
      "num_tokens": 6317448.0,
      "reward": 0.7119600772857666,
      "reward_std": 0.026777047663927078,
      "rewards/reward_func/mean": 0.7119600772857666,
      "rewards/reward_func/std": 0.026777038350701332,
      "step": 225,
      "step_time": 29.166859570890665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 118.6875,
      "completions/mean_terminated_length": 118.6875,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "entropy": 0.38523825258016586,
      "epoch": 0.01046780917091246,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000769888749346137,
      "kl": 0.001298567367484793,
      "learning_rate": 9.979157017137562e-07,
      "loss": 0.0001,
      "num_tokens": 6344451.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 226,
      "step_time": 15.558392085134983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 123.5,
      "completions/mean_terminated_length": 123.5,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.22717266157269478,
      "epoch": 0.010514126910606762,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000636727549135685,
      "kl": 0.0008928571623982862,
      "learning_rate": 9.979064381658174e-07,
      "loss": 0.0,
      "num_tokens": 6364075.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 227,
      "step_time": 13.442558009177446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 174.5625,
      "completions/mean_terminated_length": 174.5625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.44085489213466644,
      "epoch": 0.010560444650301065,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010319107677787542,
      "kl": 0.0018189146649092436,
      "learning_rate": 9.978971746178787e-07,
      "loss": 0.0001,
      "num_tokens": 6415092.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 228,
      "step_time": 25.613100692629814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 125.25,
      "completions/mean_terminated_length": 125.25,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2653019055724144,
      "epoch": 0.010606762389995368,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001061088521964848,
      "kl": 0.0015032147348392755,
      "learning_rate": 9.978879110699398e-07,
      "loss": 0.0001,
      "num_tokens": 6450776.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 229,
      "step_time": 17.40074209868908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 125.0,
      "completions/max_terminated_length": 125.0,
      "completions/mean_length": 103.3125,
      "completions/mean_terminated_length": 103.3125,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.26146209239959717,
      "epoch": 0.010653080129689671,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001286666956730187,
      "kl": 0.001344879623502493,
      "learning_rate": 9.97878647522001e-07,
      "loss": 0.0001,
      "num_tokens": 6470685.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 230,
      "step_time": 12.195948231965303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 188.5625,
      "completions/mean_terminated_length": 188.5625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3215632736682892,
      "epoch": 0.010699397869383974,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14584235846996307,
      "kl": 0.0015411792846862227,
      "learning_rate": 9.97869383974062e-07,
      "loss": 0.0219,
      "num_tokens": 6508198.0,
      "reward": 0.45201778411865234,
      "reward_std": 0.12053807079792023,
      "rewards/reward_func/mean": 0.45201778411865234,
      "rewards/reward_func/std": 0.12053807824850082,
      "step": 231,
      "step_time": 24.014050632715225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 144.75,
      "completions/mean_terminated_length": 144.75,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.34048623591661453,
      "epoch": 0.010745715609078277,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010352181270718575,
      "kl": 0.001357596556772478,
      "learning_rate": 9.978601204261232e-07,
      "loss": 0.0001,
      "num_tokens": 6534802.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 232,
      "step_time": 17.13857477903366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 161.25,
      "completions/mean_terminated_length": 161.25,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.42318225651979446,
      "epoch": 0.01079203334877258,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006775745423510671,
      "kl": 0.001376514817820862,
      "learning_rate": 9.978508568781843e-07,
      "loss": 0.0001,
      "num_tokens": 6562758.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 233,
      "step_time": 19.38410897180438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 181.6875,
      "completions/mean_terminated_length": 181.6875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2940324395895004,
      "epoch": 0.010838351088466883,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004660946724470705,
      "kl": 0.0008864189876476303,
      "learning_rate": 9.978415933302454e-07,
      "loss": 0.0,
      "num_tokens": 6586465.0,
      "reward": 0.20764601230621338,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.20764601230621338,
      "rewards/reward_func/std": 0.0,
      "step": 234,
      "step_time": 22.67731310427189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 133.125,
      "completions/mean_terminated_length": 133.125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.2967539206147194,
      "epoch": 0.010884668828161186,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027921830769628286,
      "kl": 0.0015929857036098838,
      "learning_rate": 9.978323297823066e-07,
      "loss": 0.0001,
      "num_tokens": 6616963.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 235,
      "step_time": 16.60131949931383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 122.875,
      "completions/mean_terminated_length": 122.875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2843145579099655,
      "epoch": 0.010930986567855489,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012012934312224388,
      "kl": 0.0016003412602003664,
      "learning_rate": 9.978230662343677e-07,
      "loss": 0.0001,
      "num_tokens": 6636545.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 236,
      "step_time": 14.213252019137144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 969.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 263.125,
      "completions/mean_terminated_length": 263.125,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.4105694890022278,
      "epoch": 0.010977304307549791,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05327938124537468,
      "kl": 0.0012937210267409682,
      "learning_rate": 9.978138026864288e-07,
      "loss": 0.5985,
      "num_tokens": 6670723.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 237,
      "step_time": 75.6249905526638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 117.0,
      "completions/mean_terminated_length": 117.0,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2223428264260292,
      "epoch": 0.011023622047244094,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013035659212619066,
      "kl": 0.0011440668313298374,
      "learning_rate": 9.9780453913849e-07,
      "loss": 0.0001,
      "num_tokens": 6690563.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 238,
      "step_time": 12.680224448442459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 186.125,
      "completions/mean_terminated_length": 186.125,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3829054981470108,
      "epoch": 0.011069939786938397,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005088613252155483,
      "kl": 0.001057111716363579,
      "learning_rate": 9.97795275590551e-07,
      "loss": 0.0001,
      "num_tokens": 6712389.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 239,
      "step_time": 19.046410162001848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 140.25,
      "completions/mean_terminated_length": 140.25,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.34127364307641983,
      "epoch": 0.0111162575266327,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007454964215867221,
      "kl": 0.0012433588854037225,
      "learning_rate": 9.977860120426122e-07,
      "loss": 0.0001,
      "num_tokens": 6732457.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 240,
      "step_time": 14.262619711458683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 168.1875,
      "completions/mean_terminated_length": 168.1875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.18717513233423233,
      "epoch": 0.011162575266327003,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08767064660787582,
      "kl": 0.0008982947038020939,
      "learning_rate": 9.977767484946733e-07,
      "loss": -0.07,
      "num_tokens": 6760348.0,
      "reward": 0.28112655878067017,
      "reward_std": 0.015021427534520626,
      "rewards/reward_func/mean": 0.28112655878067017,
      "rewards/reward_func/std": 0.015021426603198051,
      "step": 241,
      "step_time": 19.440937858074903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 217.875,
      "completions/mean_terminated_length": 217.875,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.33736418187618256,
      "epoch": 0.011208893006021306,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16001756489276886,
      "kl": 0.001747905946103856,
      "learning_rate": 9.977674849467347e-07,
      "loss": -0.0886,
      "num_tokens": 6793562.0,
      "reward": 0.37241876125335693,
      "reward_std": 0.4516417384147644,
      "rewards/reward_func/mean": 0.37241876125335693,
      "rewards/reward_func/std": 0.4516417384147644,
      "step": 242,
      "step_time": 24.757107455283403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 178.3125,
      "completions/mean_terminated_length": 178.3125,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.3585934340953827,
      "epoch": 0.011255210745715609,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005876480718143284,
      "kl": 0.0012372637866064906,
      "learning_rate": 9.977582213987958e-07,
      "loss": 0.0001,
      "num_tokens": 6823503.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 243,
      "step_time": 19.782935816794634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 130.625,
      "completions/mean_terminated_length": 130.625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.24060847610235214,
      "epoch": 0.011301528485409912,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005929553299210966,
      "kl": 0.0009411770151928067,
      "learning_rate": 9.97748957850857e-07,
      "loss": 0.0,
      "num_tokens": 6844905.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 244,
      "step_time": 14.155857503414154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 169.6875,
      "completions/mean_terminated_length": 169.6875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.3826494663953781,
      "epoch": 0.011347846225104215,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006471690139733255,
      "kl": 0.0013720959541387856,
      "learning_rate": 9.97739694302918e-07,
      "loss": 0.0001,
      "num_tokens": 6870548.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 245,
      "step_time": 18.97964360564947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 132.0,
      "completions/mean_terminated_length": 132.0,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2920665517449379,
      "epoch": 0.011394163964798518,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008388461428694427,
      "kl": 0.0009667183912824839,
      "learning_rate": 9.977304307549792e-07,
      "loss": 0.0,
      "num_tokens": 6902932.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 246,
      "step_time": 17.692711248993874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 169.9375,
      "completions/mean_terminated_length": 169.9375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.2462501898407936,
      "epoch": 0.01144048170449282,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006977981538511813,
      "kl": 0.0010372063552495092,
      "learning_rate": 9.977211672070403e-07,
      "loss": 0.0001,
      "num_tokens": 6926275.0,
      "reward": 0.8781879544258118,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8781879544258118,
      "rewards/reward_func/std": 0.0,
      "step": 247,
      "step_time": 17.286343712359667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 132.25,
      "completions/mean_terminated_length": 132.25,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2867204025387764,
      "epoch": 0.011486799444187124,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008923127315938473,
      "kl": 0.0012497162679210305,
      "learning_rate": 9.977119036591014e-07,
      "loss": 0.0001,
      "num_tokens": 6947207.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 248,
      "step_time": 14.922904722392559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 134.5625,
      "completions/mean_terminated_length": 134.5625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.3154388293623924,
      "epoch": 0.011533117183881426,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008344214293174446,
      "kl": 0.0013363700127229095,
      "learning_rate": 9.977026401111625e-07,
      "loss": 0.0001,
      "num_tokens": 6983040.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 249,
      "step_time": 18.25295503437519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 125.1875,
      "completions/mean_terminated_length": 125.1875,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.2925817146897316,
      "epoch": 0.01157943492357573,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024473611265420914,
      "kl": 0.001579622970893979,
      "learning_rate": 9.976933765632237e-07,
      "loss": 0.0001,
      "num_tokens": 7004899.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 250,
      "step_time": 14.662612289190292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 203.8125,
      "completions/mean_terminated_length": 203.8125,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.3871491327881813,
      "epoch": 0.011625752663270032,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06019943207502365,
      "kl": 0.0012392482603900135,
      "learning_rate": 9.976841130152848e-07,
      "loss": 0.0422,
      "num_tokens": 7029920.0,
      "reward": 0.17361781001091003,
      "reward_std": 0.3732667565345764,
      "rewards/reward_func/mean": 0.17361781001091003,
      "rewards/reward_func/std": 0.3732668161392212,
      "step": 251,
      "step_time": 24.29412142932415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 119.625,
      "completions/mean_terminated_length": 119.625,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.2839593477547169,
      "epoch": 0.011672070402964335,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010288519551977515,
      "kl": 0.001156790676759556,
      "learning_rate": 9.97674849467346e-07,
      "loss": 0.0001,
      "num_tokens": 7050618.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 252,
      "step_time": 13.062179304659367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 118.9375,
      "completions/mean_terminated_length": 118.9375,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.24643199145793915,
      "epoch": 0.011718388142658638,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000641032587736845,
      "kl": 0.0011265594221185893,
      "learning_rate": 9.97665585919407e-07,
      "loss": 0.0001,
      "num_tokens": 7073193.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 253,
      "step_time": 14.38772228360176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 187.9375,
      "completions/mean_terminated_length": 187.9375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3889130726456642,
      "epoch": 0.011764705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07069052755832672,
      "kl": 0.0013481812202371657,
      "learning_rate": 9.976563223714682e-07,
      "loss": -0.0847,
      "num_tokens": 7096248.0,
      "reward": 0.11693838238716125,
      "reward_std": 0.3195364773273468,
      "rewards/reward_func/mean": 0.11693838238716125,
      "rewards/reward_func/std": 0.3195364773273468,
      "step": 254,
      "step_time": 20.836565881967545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 121.9375,
      "completions/mean_terminated_length": 121.9375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2889237403869629,
      "epoch": 0.011811023622047244,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007356893620453775,
      "kl": 0.0010288926860084757,
      "learning_rate": 9.976470588235295e-07,
      "loss": 0.0001,
      "num_tokens": 7117655.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 255,
      "step_time": 14.183229140937328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 146.75,
      "completions/mean_terminated_length": 146.75,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.4106031656265259,
      "epoch": 0.011857341361741547,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008677243022248149,
      "kl": 0.0016055423475336283,
      "learning_rate": 9.976377952755906e-07,
      "loss": 0.0001,
      "num_tokens": 7151011.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 256,
      "step_time": 19.09181548282504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 110.125,
      "completions/mean_terminated_length": 110.125,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.2853569909930229,
      "epoch": 0.01190365910143585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014247329672798514,
      "kl": 0.0015982910699676722,
      "learning_rate": 9.976285317276515e-07,
      "loss": 0.0001,
      "num_tokens": 7170469.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 257,
      "step_time": 12.658961690962315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 159.875,
      "completions/mean_terminated_length": 159.875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.3763571232557297,
      "epoch": 0.011949976841130153,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006330300820991397,
      "kl": 0.0013355592382140458,
      "learning_rate": 9.976192681797129e-07,
      "loss": 0.0001,
      "num_tokens": 7202131.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 258,
      "step_time": 18.526979483664036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 162.1875,
      "completions/mean_terminated_length": 162.1875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.4172009825706482,
      "epoch": 0.011996294580824456,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007413196726702154,
      "kl": 0.0014627919590566307,
      "learning_rate": 9.97610004631774e-07,
      "loss": 0.0001,
      "num_tokens": 7236982.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 259,
      "step_time": 19.692689403891563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 195.375,
      "completions/mean_terminated_length": 195.375,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.3919227793812752,
      "epoch": 0.012042612320518759,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08192694187164307,
      "kl": 0.0018703019595704973,
      "learning_rate": 9.976007410838351e-07,
      "loss": 0.006,
      "num_tokens": 7258924.0,
      "reward": 0.3125,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.3125,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 260,
      "step_time": 19.39155102148652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 289.0,
      "completions/max_terminated_length": 289.0,
      "completions/mean_length": 204.5625,
      "completions/mean_terminated_length": 204.5625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.43711964786052704,
      "epoch": 0.012088930060213061,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0016617421060800552,
      "kl": 0.0020323360804468393,
      "learning_rate": 9.975914775358962e-07,
      "loss": -0.0003,
      "num_tokens": 7283941.0,
      "reward": 1.378031839749383e-07,
      "reward_std": 5.512127358997532e-07,
      "rewards/reward_func/mean": 1.378031839749383e-07,
      "rewards/reward_func/std": 5.512127358997532e-07,
      "step": 261,
      "step_time": 24.59879645705223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 179.0625,
      "completions/mean_terminated_length": 179.0625,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.14695752039551735,
      "epoch": 0.012135247799907364,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004047244437970221,
      "kl": 0.0005786035471828654,
      "learning_rate": 9.975822139879574e-07,
      "loss": 0.0,
      "num_tokens": 7307798.0,
      "reward": 0.8172460198402405,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8172460198402405,
      "rewards/reward_func/std": 0.0,
      "step": 262,
      "step_time": 19.629223205149174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 130.625,
      "completions/mean_terminated_length": 130.625,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.3128410279750824,
      "epoch": 0.012181565539601667,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019142779055982828,
      "kl": 0.0016148071445059031,
      "learning_rate": 9.975729504400185e-07,
      "loss": 0.0001,
      "num_tokens": 7331632.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 263,
      "step_time": 14.436136823147535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 192.8125,
      "completions/mean_terminated_length": 192.8125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.18640683591365814,
      "epoch": 0.01222788327929597,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004958834615536034,
      "kl": 0.0007326693012146279,
      "learning_rate": 9.975636868920796e-07,
      "loss": 0.0,
      "num_tokens": 7385101.0,
      "reward": 0.11362193524837494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11362193524837494,
      "rewards/reward_func/std": 0.0,
      "step": 264,
      "step_time": 27.704222440719604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 124.0,
      "completions/max_terminated_length": 124.0,
      "completions/mean_length": 107.3125,
      "completions/mean_terminated_length": 107.3125,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.24617928266525269,
      "epoch": 0.012274201018990273,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006484591867774725,
      "kl": 0.0009265294938813895,
      "learning_rate": 9.975544233441407e-07,
      "loss": 0.0,
      "num_tokens": 7404402.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 265,
      "step_time": 12.239355891942978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 131.8125,
      "completions/mean_terminated_length": 131.8125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.26922178268432617,
      "epoch": 0.012320518758684576,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007232644129544497,
      "kl": 0.0010748645290732384,
      "learning_rate": 9.975451597962019e-07,
      "loss": 0.0001,
      "num_tokens": 7425023.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 266,
      "step_time": 13.867826867848635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 146.75,
      "completions/mean_terminated_length": 146.75,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.3577575385570526,
      "epoch": 0.012366836498378879,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008738681790418923,
      "kl": 0.0012663750094361603,
      "learning_rate": 9.97535896248263e-07,
      "loss": 0.0001,
      "num_tokens": 7448555.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 267,
      "step_time": 17.042743027210236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 133.375,
      "completions/mean_terminated_length": 133.375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.21464881673455238,
      "epoch": 0.012413154238073182,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010989999864250422,
      "kl": 0.0009238123311661184,
      "learning_rate": 9.975266327003243e-07,
      "loss": 0.0,
      "num_tokens": 7468689.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 268,
      "step_time": 15.706818025559187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 188.5,
      "completions/mean_terminated_length": 188.5,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.332606703042984,
      "epoch": 0.012459471977767485,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013649666216224432,
      "kl": 0.0015467848279513419,
      "learning_rate": 9.975173691523852e-07,
      "loss": 0.0001,
      "num_tokens": 7505561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 269,
      "step_time": 22.800201173871756
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 150.8125,
      "completions/mean_terminated_length": 150.8125,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.15520111098885536,
      "epoch": 0.012505789717461788,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007402778137475252,
      "kl": 0.00071650069730822,
      "learning_rate": 9.975081056044464e-07,
      "loss": 0.0,
      "num_tokens": 7530086.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 270,
      "step_time": 16.660070817917585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 281.0,
      "completions/max_terminated_length": 281.0,
      "completions/mean_length": 187.125,
      "completions/mean_terminated_length": 187.125,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.38500913232564926,
      "epoch": 0.01255210745715609,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001494114869274199,
      "kl": 0.0017256204737350345,
      "learning_rate": 9.974988420565075e-07,
      "loss": 0.0001,
      "num_tokens": 7556344.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 271,
      "step_time": 24.9211747944355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 262.0,
      "completions/max_terminated_length": 262.0,
      "completions/mean_length": 200.375,
      "completions/mean_terminated_length": 200.375,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.3571956604719162,
      "epoch": 0.012598425196850394,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005984654417261481,
      "kl": 0.0011825784167740494,
      "learning_rate": 9.974895785085688e-07,
      "loss": 0.0001,
      "num_tokens": 7578334.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 272,
      "step_time": 22.358461305499077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 133.25,
      "completions/mean_terminated_length": 133.25,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.24652643501758575,
      "epoch": 0.012644742936544696,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016675933729857206,
      "kl": 0.00128700424102135,
      "learning_rate": 9.9748031496063e-07,
      "loss": 0.0001,
      "num_tokens": 7599490.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 273,
      "step_time": 14.426620122045279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 176.5,
      "completions/mean_terminated_length": 176.5,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.1960732415318489,
      "epoch": 0.012691060676239,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004313480167184025,
      "kl": 0.0006078024889575318,
      "learning_rate": 9.97471051412691e-07,
      "loss": 0.0,
      "num_tokens": 7627594.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 274,
      "step_time": 19.362862575799227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 125.8125,
      "completions/mean_terminated_length": 125.8125,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2697870396077633,
      "epoch": 0.012737378415933302,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00127905432600528,
      "kl": 0.0012629003031179309,
      "learning_rate": 9.974617878647522e-07,
      "loss": 0.0001,
      "num_tokens": 7651095.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 275,
      "step_time": 15.04151302203536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 126.375,
      "completions/mean_terminated_length": 126.375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.29414065927267075,
      "epoch": 0.012783696155627605,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010536487679928541,
      "kl": 0.0011426027631387115,
      "learning_rate": 9.974525243168133e-07,
      "loss": 0.0001,
      "num_tokens": 7670573.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 276,
      "step_time": 13.268250782042742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 136.5,
      "completions/mean_terminated_length": 136.5,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.29729852825403214,
      "epoch": 0.012830013895321908,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012455489486455917,
      "kl": 0.0014235415437724441,
      "learning_rate": 9.974432607688744e-07,
      "loss": 0.0001,
      "num_tokens": 7692069.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 277,
      "step_time": 14.937441002577543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 151.125,
      "completions/mean_terminated_length": 151.125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.38496117293834686,
      "epoch": 0.012876331635016211,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006876222905702889,
      "kl": 0.001357803848804906,
      "learning_rate": 9.974339972209356e-07,
      "loss": 0.0001,
      "num_tokens": 7746199.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 278,
      "step_time": 24.39612015336752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 153.8125,
      "completions/mean_terminated_length": 153.8125,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.4385102689266205,
      "epoch": 0.012922649374710514,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004172384215053171,
      "kl": 0.0011424672266002744,
      "learning_rate": 9.974247336729967e-07,
      "loss": 0.0001,
      "num_tokens": 7774548.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 279,
      "step_time": 18.329782836139202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 205.3125,
      "completions/mean_terminated_length": 205.3125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3062855675816536,
      "epoch": 0.012968967114404817,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020328606478869915,
      "kl": 0.0012164927611593157,
      "learning_rate": 9.974154701250578e-07,
      "loss": 0.0001,
      "num_tokens": 7799625.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 280,
      "step_time": 22.674007039517164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 199.4375,
      "completions/mean_terminated_length": 199.4375,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.14915741235017776,
      "epoch": 0.01301528485409912,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004247923498041928,
      "kl": 0.000717098475433886,
      "learning_rate": 9.97406206577119e-07,
      "loss": 0.0,
      "num_tokens": 7833552.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 281,
      "step_time": 22.713849186897278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 160.75,
      "completions/mean_terminated_length": 160.75,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3701997995376587,
      "epoch": 0.013061602593793423,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006304908310994506,
      "kl": 0.0010913874139077961,
      "learning_rate": 9.9739694302918e-07,
      "loss": 0.0001,
      "num_tokens": 7854460.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 282,
      "step_time": 16.44683262333274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 154.9375,
      "completions/mean_terminated_length": 154.9375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3792154788970947,
      "epoch": 0.013107920333487726,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010001423070207238,
      "kl": 0.001630262704566121,
      "learning_rate": 9.973876794812412e-07,
      "loss": 0.0001,
      "num_tokens": 7878971.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 283,
      "step_time": 17.895399875938892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 202.875,
      "completions/mean_terminated_length": 202.875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.208527822047472,
      "epoch": 0.013154238073182029,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006816262030042708,
      "kl": 0.0008669499948155135,
      "learning_rate": 9.973784159333023e-07,
      "loss": 0.0,
      "num_tokens": 7908409.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 284,
      "step_time": 23.0435762219131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 171.3125,
      "completions/mean_terminated_length": 171.3125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.39356208592653275,
      "epoch": 0.013200555812876331,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012899527791887522,
      "kl": 0.0014151684008538723,
      "learning_rate": 9.973691523853637e-07,
      "loss": 0.0001,
      "num_tokens": 7941534.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 285,
      "step_time": 19.861068926751614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 162.0,
      "completions/mean_terminated_length": 162.0,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.24289053678512573,
      "epoch": 0.013246873552570634,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07930362224578857,
      "kl": 0.000889660048414953,
      "learning_rate": 9.973598888374248e-07,
      "loss": -0.0088,
      "num_tokens": 7963902.0,
      "reward": 0.8952482342720032,
      "reward_std": 0.03075244091451168,
      "rewards/reward_func/mean": 0.8952482342720032,
      "rewards/reward_func/std": 0.030752435326576233,
      "step": 286,
      "step_time": 17.126583348959684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 210.6875,
      "completions/mean_terminated_length": 210.6875,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.39860232919454575,
      "epoch": 0.013293191292264937,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08467237651348114,
      "kl": 0.0015759188390802592,
      "learning_rate": 9.97350625289486e-07,
      "loss": -0.0881,
      "num_tokens": 7999417.0,
      "reward": 0.0625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.0625,
      "rewards/reward_func/std": 0.25,
      "step": 287,
      "step_time": 26.797319907695055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 153.125,
      "completions/mean_terminated_length": 153.125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.25086531043052673,
      "epoch": 0.01333950903195924,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007963802199810743,
      "kl": 0.001119767752243206,
      "learning_rate": 9.973413617415468e-07,
      "loss": 0.0001,
      "num_tokens": 8019803.0,
      "reward": 0.780767560005188,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.780767560005188,
      "rewards/reward_func/std": 0.0,
      "step": 288,
      "step_time": 15.925499644130468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 120.1875,
      "completions/mean_terminated_length": 120.1875,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.27041003108024597,
      "epoch": 0.013385826771653543,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007142223767004907,
      "kl": 0.001016193040413782,
      "learning_rate": 9.973320981936082e-07,
      "loss": 0.0001,
      "num_tokens": 8042414.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 289,
      "step_time": 14.222120333462954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 236.625,
      "completions/mean_terminated_length": 236.625,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "entropy": 0.3460284397006035,
      "epoch": 0.013432144511347846,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.057437408715486526,
      "kl": 0.0011360525531927124,
      "learning_rate": 9.973228346456693e-07,
      "loss": -0.0572,
      "num_tokens": 8079592.0,
      "reward": 0.20799149572849274,
      "reward_std": 0.15038849413394928,
      "rewards/reward_func/mean": 0.20799149572849274,
      "rewards/reward_func/std": 0.15038849413394928,
      "step": 290,
      "step_time": 26.500631351023912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 175.625,
      "completions/mean_terminated_length": 175.625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.20297640562057495,
      "epoch": 0.013478462251042149,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009243394597433507,
      "kl": 0.0009111710969591513,
      "learning_rate": 9.973135710977304e-07,
      "loss": 0.0,
      "num_tokens": 8103234.0,
      "reward": 0.6347364187240601,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6347364187240601,
      "rewards/reward_func/std": 0.0,
      "step": 291,
      "step_time": 18.534159436821938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 179.0,
      "completions/mean_terminated_length": 179.0,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.39861272275447845,
      "epoch": 0.013524779990736452,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004690671630669385,
      "kl": 0.0011920387914869934,
      "learning_rate": 9.973043075497915e-07,
      "loss": 0.0001,
      "num_tokens": 8130866.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 292,
      "step_time": 20.164982356131077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 149.625,
      "completions/mean_terminated_length": 149.625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.36604560166597366,
      "epoch": 0.013571097730430755,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008623342146165669,
      "kl": 0.0014578664267901331,
      "learning_rate": 9.972950440018527e-07,
      "loss": 0.0001,
      "num_tokens": 8153020.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 293,
      "step_time": 17.66485656797886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 189.125,
      "completions/mean_terminated_length": 189.125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.41096896678209305,
      "epoch": 0.013617415470125058,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001160994521342218,
      "kl": 0.002139343094313517,
      "learning_rate": 9.972857804539138e-07,
      "loss": 0.0001,
      "num_tokens": 8185294.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 294,
      "step_time": 22.892805226147175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 134.5625,
      "completions/mean_terminated_length": 134.5625,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.30392035841941833,
      "epoch": 0.01366373320981936,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015385417500510812,
      "kl": 0.0014921021938789636,
      "learning_rate": 9.97276516905975e-07,
      "loss": 0.0001,
      "num_tokens": 8205271.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 295,
      "step_time": 14.012457262724638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 217.125,
      "completions/mean_terminated_length": 217.125,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.24468612298369408,
      "epoch": 0.013710050949513664,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004593682533595711,
      "kl": 0.0008445656712865457,
      "learning_rate": 9.97267253358036e-07,
      "loss": 0.0,
      "num_tokens": 8232601.0,
      "reward": 0.2555498778820038,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.2555498778820038,
      "rewards/reward_func/std": 0.0,
      "step": 296,
      "step_time": 23.20956961810589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 192.3125,
      "completions/mean_terminated_length": 192.3125,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.13571469858288765,
      "epoch": 0.013756368689207966,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005670030950568616,
      "kl": 0.0007186480070231482,
      "learning_rate": 9.972579898100972e-07,
      "loss": 0.0,
      "num_tokens": 8257934.0,
      "reward": 0.9375209808349609,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9375209808349609,
      "rewards/reward_func/std": 0.0,
      "step": 297,
      "step_time": 19.63731164112687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 134.1875,
      "completions/mean_terminated_length": 134.1875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.25146742910146713,
      "epoch": 0.01380268642890227,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010619927197694778,
      "kl": 0.0012564520293381065,
      "learning_rate": 9.972487262621585e-07,
      "loss": 0.0001,
      "num_tokens": 8277809.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 298,
      "step_time": 14.803503945469856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 295.0,
      "completions/max_terminated_length": 295.0,
      "completions/mean_length": 207.875,
      "completions/mean_terminated_length": 207.875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.2383860982954502,
      "epoch": 0.013849004168596572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07985582202672958,
      "kl": 0.0009646453545428813,
      "learning_rate": 9.972394627142196e-07,
      "loss": 0.054,
      "num_tokens": 8303631.0,
      "reward": 0.8738229274749756,
      "reward_std": 0.24180229008197784,
      "rewards/reward_func/mean": 0.8738229274749756,
      "rewards/reward_func/std": 0.24180230498313904,
      "step": 299,
      "step_time": 25.31433679163456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 126.1875,
      "completions/mean_terminated_length": 126.1875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2671867311000824,
      "epoch": 0.013895321908290875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011980956187471747,
      "kl": 0.00120316629181616,
      "learning_rate": 9.972301991662805e-07,
      "loss": 0.0001,
      "num_tokens": 8332450.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 300,
      "step_time": 15.832930944859982
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 131.75,
      "completions/mean_terminated_length": 131.75,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2855543717741966,
      "epoch": 0.013941639647985178,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007711020880378783,
      "kl": 0.0012660915090236813,
      "learning_rate": 9.972209356183417e-07,
      "loss": 0.0001,
      "num_tokens": 8353486.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 301,
      "step_time": 14.268948875367641
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 151.5625,
      "completions/mean_terminated_length": 151.5625,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.4135436415672302,
      "epoch": 0.013987957387679481,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005681651527993381,
      "kl": 0.0010966947593260556,
      "learning_rate": 9.97211672070403e-07,
      "loss": 0.0001,
      "num_tokens": 8385031.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 302,
      "step_time": 18.032295767217875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 187.625,
      "completions/mean_terminated_length": 187.625,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.40466737747192383,
      "epoch": 0.014034275127373784,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006597894825972617,
      "kl": 0.0015541419852524996,
      "learning_rate": 9.972024085224641e-07,
      "loss": 0.0001,
      "num_tokens": 8421425.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 303,
      "step_time": 23.78118661046028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 153.125,
      "completions/mean_terminated_length": 153.125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.33878885209560394,
      "epoch": 0.014080592867068087,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011138013796880841,
      "kl": 0.001382181013468653,
      "learning_rate": 9.971931449745252e-07,
      "loss": 0.0001,
      "num_tokens": 8443795.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 304,
      "step_time": 16.13924302533269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 161.0625,
      "completions/mean_terminated_length": 161.0625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3237125128507614,
      "epoch": 0.01412691060676239,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006071277312003076,
      "kl": 0.0012195921153761446,
      "learning_rate": 9.971838814265864e-07,
      "loss": 0.0001,
      "num_tokens": 8470948.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 305,
      "step_time": 19.79815885797143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 360.0,
      "completions/max_terminated_length": 360.0,
      "completions/mean_length": 211.5,
      "completions/mean_terminated_length": 211.5,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.47993937134742737,
      "epoch": 0.014173228346456693,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07259546965360641,
      "kl": 0.0017538362299092114,
      "learning_rate": 9.971746178786475e-07,
      "loss": 0.1489,
      "num_tokens": 8493404.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 306,
      "step_time": 29.653807297348976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 200.5625,
      "completions/mean_terminated_length": 200.5625,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.2899518199265003,
      "epoch": 0.014219546086150996,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0738273486495018,
      "kl": 0.0010956964979413897,
      "learning_rate": 9.971653543307086e-07,
      "loss": -0.037,
      "num_tokens": 8519717.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 307,
      "step_time": 22.862802632153034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 166.0,
      "completions/mean_terminated_length": 166.0,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.38063275814056396,
      "epoch": 0.014265863825845299,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008101991261355579,
      "kl": 0.0019126106635667384,
      "learning_rate": 9.971560907827697e-07,
      "loss": 0.0001,
      "num_tokens": 8549221.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 308,
      "step_time": 18.756974667310715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 149.4375,
      "completions/mean_terminated_length": 149.4375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.19209444895386696,
      "epoch": 0.014312181565539601,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005579253192991018,
      "kl": 0.000857373554026708,
      "learning_rate": 9.971468272348309e-07,
      "loss": 0.0,
      "num_tokens": 8570252.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 309,
      "step_time": 16.16516475379467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 263.0,
      "completions/max_terminated_length": 263.0,
      "completions/mean_length": 236.1875,
      "completions/mean_terminated_length": 236.1875,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "entropy": 0.2595669776201248,
      "epoch": 0.014358499305233904,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05039333924651146,
      "kl": 0.0008414801704930142,
      "learning_rate": 9.97137563686892e-07,
      "loss": -0.0361,
      "num_tokens": 8592431.0,
      "reward": 0.6965094804763794,
      "reward_std": 0.18601341545581818,
      "rewards/reward_func/mean": 0.6965094804763794,
      "rewards/reward_func/std": 0.18601341545581818,
      "step": 310,
      "step_time": 22.22887173295021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 339.0,
      "completions/max_terminated_length": 339.0,
      "completions/mean_length": 291.0625,
      "completions/mean_terminated_length": 291.0625,
      "completions/min_length": 212.0,
      "completions/min_terminated_length": 212.0,
      "entropy": 0.2998119741678238,
      "epoch": 0.014404817044928207,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06676234304904938,
      "kl": 0.0010350022348575294,
      "learning_rate": 9.971283001389531e-07,
      "loss": -0.0599,
      "num_tokens": 8632096.0,
      "reward": 0.8322875499725342,
      "reward_std": 0.22784748673439026,
      "rewards/reward_func/mean": 0.8322875499725342,
      "rewards/reward_func/std": 0.22784748673439026,
      "step": 311,
      "step_time": 31.391690842807293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 138.625,
      "completions/mean_terminated_length": 138.625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.2928355559706688,
      "epoch": 0.01445113478462251,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009526091744191945,
      "kl": 0.0013767742784693837,
      "learning_rate": 9.971190365910142e-07,
      "loss": 0.0001,
      "num_tokens": 8655738.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 312,
      "step_time": 15.893251542001963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 180.1875,
      "completions/mean_terminated_length": 180.1875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3426474630832672,
      "epoch": 0.014497452524316813,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005128518678247929,
      "kl": 0.0011518497776705772,
      "learning_rate": 9.971097730430754e-07,
      "loss": 0.0001,
      "num_tokens": 8686589.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 313,
      "step_time": 20.093024745583534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 171.125,
      "completions/mean_terminated_length": 171.125,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.268556572496891,
      "epoch": 0.014543770264011116,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005823679966852069,
      "kl": 0.0009972721163649112,
      "learning_rate": 9.971005094951365e-07,
      "loss": 0.0001,
      "num_tokens": 8710559.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 314,
      "step_time": 18.503258530050516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 145.6875,
      "completions/mean_terminated_length": 145.6875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.37556004524230957,
      "epoch": 0.014590088003705419,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001303147291764617,
      "kl": 0.0015021058497950435,
      "learning_rate": 9.970912459471978e-07,
      "loss": 0.0001,
      "num_tokens": 8741338.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 315,
      "step_time": 20.34307497739792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 127.6875,
      "completions/mean_terminated_length": 127.6875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2628978341817856,
      "epoch": 0.014636405743399722,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000976155512034893,
      "kl": 0.0014791136200074106,
      "learning_rate": 9.97081982399259e-07,
      "loss": 0.0001,
      "num_tokens": 8761637.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 316,
      "step_time": 14.552318941801786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 209.8125,
      "completions/mean_terminated_length": 209.8125,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.2628732994198799,
      "epoch": 0.014682723483094025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09145770221948624,
      "kl": 0.0013813844125252217,
      "learning_rate": 9.9707271885132e-07,
      "loss": -0.036,
      "num_tokens": 8790562.0,
      "reward": 0.8237026929855347,
      "reward_std": 0.18207906186580658,
      "rewards/reward_func/mean": 0.8237026929855347,
      "rewards/reward_func/std": 0.18207907676696777,
      "step": 317,
      "step_time": 22.769946806132793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 138.6875,
      "completions/mean_terminated_length": 138.6875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3076990395784378,
      "epoch": 0.014729041222788328,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011036385549232364,
      "kl": 0.0017139567353297025,
      "learning_rate": 9.97063455303381e-07,
      "loss": 0.0001,
      "num_tokens": 8811389.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 318,
      "step_time": 15.330391250550747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 139.25,
      "completions/mean_terminated_length": 139.25,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2163013182580471,
      "epoch": 0.01477535896248263,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07880755513906479,
      "kl": 0.0011645381164271384,
      "learning_rate": 9.970541917554423e-07,
      "loss": 0.0183,
      "num_tokens": 8832513.0,
      "reward": 0.8862214088439941,
      "reward_std": 0.04104293882846832,
      "rewards/reward_func/mean": 0.8862214088439941,
      "rewards/reward_func/std": 0.04104295372962952,
      "step": 319,
      "step_time": 15.956313017755747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 159.75,
      "completions/mean_terminated_length": 159.75,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.38351473957300186,
      "epoch": 0.014821676702176934,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007289415807463229,
      "kl": 0.0014589433558285236,
      "learning_rate": 9.970449282075035e-07,
      "loss": 0.0001,
      "num_tokens": 8855965.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 320,
      "step_time": 18.83362502232194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 187.5625,
      "completions/mean_terminated_length": 187.5625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.33954180777072906,
      "epoch": 0.014867994441871236,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08285839110612869,
      "kl": 0.0012769268942065537,
      "learning_rate": 9.970356646595646e-07,
      "loss": 0.0345,
      "num_tokens": 8879094.0,
      "reward": 0.8875277638435364,
      "reward_std": 0.23965653777122498,
      "rewards/reward_func/mean": 0.8875277638435364,
      "rewards/reward_func/std": 0.23965655267238617,
      "step": 321,
      "step_time": 21.82441758364439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 133.9375,
      "completions/mean_terminated_length": 133.9375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3537861630320549,
      "epoch": 0.01491431218156554,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008833041065372527,
      "kl": 0.0018543840560596436,
      "learning_rate": 9.970264011116257e-07,
      "loss": 0.0001,
      "num_tokens": 8926021.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 322,
      "step_time": 21.40423509478569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 192.375,
      "completions/mean_terminated_length": 192.375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.25622427463531494,
      "epoch": 0.014960629921259842,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05758245661854744,
      "kl": 0.0009644064848544076,
      "learning_rate": 9.970171375636868e-07,
      "loss": 0.0055,
      "num_tokens": 8948891.0,
      "reward": 0.9782751798629761,
      "reward_std": 0.03886253759264946,
      "rewards/reward_func/mean": 0.9782751798629761,
      "rewards/reward_func/std": 0.03886254131793976,
      "step": 323,
      "step_time": 21.875840231776237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 206.3125,
      "completions/mean_terminated_length": 206.3125,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.27018605917692184,
      "epoch": 0.015006947660954145,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006027469644322991,
      "kl": 0.001118411382776685,
      "learning_rate": 9.97007874015748e-07,
      "loss": 0.0001,
      "num_tokens": 8979616.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 324,
      "step_time": 23.039261762052774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 187.125,
      "completions/mean_terminated_length": 187.125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.32924505323171616,
      "epoch": 0.015053265400648448,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006592704448848963,
      "kl": 0.001322623051237315,
      "learning_rate": 9.96998610467809e-07,
      "loss": 0.0001,
      "num_tokens": 9036930.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 325,
      "step_time": 28.929474364966154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 177.6875,
      "completions/mean_terminated_length": 177.6875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.2153313159942627,
      "epoch": 0.015099583140342751,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000623970408923924,
      "kl": 0.0009110145183512941,
      "learning_rate": 9.969893469198702e-07,
      "loss": 0.0,
      "num_tokens": 9057949.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 326,
      "step_time": 17.84788030385971
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 134.375,
      "completions/mean_terminated_length": 134.375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.221485685557127,
      "epoch": 0.015145900880037054,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003161739557981491,
      "kl": 0.0013649599568452686,
      "learning_rate": 9.969800833719313e-07,
      "loss": 0.0001,
      "num_tokens": 9077795.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 327,
      "step_time": 14.65321834385395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 167.5,
      "completions/mean_terminated_length": 167.5,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.22197071090340614,
      "epoch": 0.015192218619731357,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08835534751415253,
      "kl": 0.0013470058620441705,
      "learning_rate": 9.969708198239927e-07,
      "loss": -0.0439,
      "num_tokens": 9099611.0,
      "reward": 0.5268779397010803,
      "reward_std": 0.06872842460870743,
      "rewards/reward_func/mean": 0.5268779397010803,
      "rewards/reward_func/std": 0.06872842460870743,
      "step": 328,
      "step_time": 17.72628043591976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 189.375,
      "completions/mean_terminated_length": 189.375,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.14309130609035492,
      "epoch": 0.01523853635942566,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003652969899121672,
      "kl": 0.0005090439590276219,
      "learning_rate": 9.969615562760538e-07,
      "loss": 0.0,
      "num_tokens": 9134081.0,
      "reward": 0.9607894420623779,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9607894420623779,
      "rewards/reward_func/std": 0.0,
      "step": 329,
      "step_time": 21.34814863279462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 218.0,
      "completions/mean_terminated_length": 218.0,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.4148356765508652,
      "epoch": 0.015284854099119963,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006424064631573856,
      "kl": 0.001229463203344494,
      "learning_rate": 9.96952292728115e-07,
      "loss": 0.0001,
      "num_tokens": 9155745.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 330,
      "step_time": 23.497181992977858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 139.0,
      "completions/mean_terminated_length": 139.0,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.32033444195985794,
      "epoch": 0.015331171838814266,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008130758069455624,
      "kl": 0.0012869780184701085,
      "learning_rate": 9.969430291801758e-07,
      "loss": 0.0001,
      "num_tokens": 9175809.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 331,
      "step_time": 15.519235752522945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 166.25,
      "completions/mean_terminated_length": 166.25,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2561548873782158,
      "epoch": 0.015377489578508569,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09667813777923584,
      "kl": 0.0011559998383745551,
      "learning_rate": 9.969337656322372e-07,
      "loss": 0.0057,
      "num_tokens": 9196565.0,
      "reward": 0.8072022795677185,
      "reward_std": 0.17560791969299316,
      "rewards/reward_func/mean": 0.8072022795677185,
      "rewards/reward_func/std": 0.17560791969299316,
      "step": 332,
      "step_time": 17.828052032738924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 332.0,
      "completions/max_terminated_length": 332.0,
      "completions/mean_length": 243.125,
      "completions/mean_terminated_length": 243.125,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.3886435329914093,
      "epoch": 0.015423807318202871,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0727555900812149,
      "kl": 0.0022208106820471585,
      "learning_rate": 9.969245020842983e-07,
      "loss": -0.15,
      "num_tokens": 9230295.0,
      "reward": 0.1661306470632553,
      "reward_std": 0.35719001293182373,
      "rewards/reward_func/mean": 0.1661306470632553,
      "rewards/reward_func/std": 0.35719001293182373,
      "step": 333,
      "step_time": 29.55166383087635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 172.1875,
      "completions/mean_terminated_length": 172.1875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.38166338950395584,
      "epoch": 0.015470125057897174,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001050234423018992,
      "kl": 0.0015453605155926198,
      "learning_rate": 9.969152385363594e-07,
      "loss": 0.0001,
      "num_tokens": 9269242.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 334,
      "step_time": 22.607544537633657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 155.75,
      "completions/mean_terminated_length": 155.75,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.36232052743434906,
      "epoch": 0.015516442797591477,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012394188670441508,
      "kl": 0.0012941665190737695,
      "learning_rate": 9.969059749884205e-07,
      "loss": 0.0001,
      "num_tokens": 9295750.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 335,
      "step_time": 18.219283301383257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 174.1875,
      "completions/mean_terminated_length": 174.1875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.3761717975139618,
      "epoch": 0.01556276053728578,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016768844798207283,
      "kl": 0.0014611249644076452,
      "learning_rate": 9.968967114404817e-07,
      "loss": 0.0001,
      "num_tokens": 9317465.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 336,
      "step_time": 18.309650901705027
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 155.6875,
      "completions/mean_terminated_length": 155.6875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.21403182297945023,
      "epoch": 0.015609078276980083,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008231138344854116,
      "kl": 0.0009362594864796847,
      "learning_rate": 9.968874478925428e-07,
      "loss": 0.0,
      "num_tokens": 9346772.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 337,
      "step_time": 19.34696962684393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 171.125,
      "completions/mean_terminated_length": 171.125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.33988913148641586,
      "epoch": 0.015655396016674386,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004363682121038437,
      "kl": 0.002511500206310302,
      "learning_rate": 9.96878184344604e-07,
      "loss": 0.0001,
      "num_tokens": 9372294.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 338,
      "step_time": 19.320641227066517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3785661906003952,
      "epoch": 0.01570171375636869,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015458170091733336,
      "kl": 0.0016905050433706492,
      "learning_rate": 9.96868920796665e-07,
      "loss": 0.0001,
      "num_tokens": 9393844.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 339,
      "step_time": 20.65398971363902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 165.1875,
      "completions/mean_terminated_length": 165.1875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.4234740734100342,
      "epoch": 0.015748031496062992,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007248983602039516,
      "kl": 0.0012707824644166976,
      "learning_rate": 9.968596572487262e-07,
      "loss": 0.0001,
      "num_tokens": 9417047.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 340,
      "step_time": 17.50099764764309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 181.6875,
      "completions/mean_terminated_length": 181.6875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3823828175663948,
      "epoch": 0.015794349235757295,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007550466689281166,
      "kl": 0.0013036046584602445,
      "learning_rate": 9.968503937007873e-07,
      "loss": 0.0001,
      "num_tokens": 9440706.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 341,
      "step_time": 19.994986213743687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 183.6875,
      "completions/mean_terminated_length": 183.6875,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.42326007038354874,
      "epoch": 0.015840666975451598,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004449597036000341,
      "kl": 0.0012013615923933685,
      "learning_rate": 9.968411301528486e-07,
      "loss": 0.0001,
      "num_tokens": 9461901.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 342,
      "step_time": 19.592860255390406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 117.6875,
      "completions/mean_terminated_length": 117.6875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.25822774320840836,
      "epoch": 0.0158869847151459,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022317171096801758,
      "kl": 0.001495075732236728,
      "learning_rate": 9.968318666049095e-07,
      "loss": 0.0001,
      "num_tokens": 9482216.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 343,
      "step_time": 13.19613303616643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 124.0,
      "completions/max_terminated_length": 124.0,
      "completions/mean_length": 107.6875,
      "completions/mean_terminated_length": 107.6875,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.30405353754758835,
      "epoch": 0.015933302454840204,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007039046613499522,
      "kl": 0.0011159517744090408,
      "learning_rate": 9.968226030569707e-07,
      "loss": 0.0001,
      "num_tokens": 9503075.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 344,
      "step_time": 12.520101103931665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 159.875,
      "completions/mean_terminated_length": 159.875,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.32665305584669113,
      "epoch": 0.015979620194534506,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006344412686303258,
      "kl": 0.0013704613083973527,
      "learning_rate": 9.96813339509032e-07,
      "loss": 0.0001,
      "num_tokens": 9538001.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 345,
      "step_time": 20.899267457425594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 159.0625,
      "completions/mean_terminated_length": 159.0625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.21141353622078896,
      "epoch": 0.01602593793422881,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004040278145112097,
      "kl": 0.0006534120911965147,
      "learning_rate": 9.968040759610931e-07,
      "loss": 0.0,
      "num_tokens": 9567522.0,
      "reward": 0.8751733303070068,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8751733303070068,
      "rewards/reward_func/std": 0.0,
      "step": 346,
      "step_time": 17.915325086563826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 351.0,
      "completions/max_terminated_length": 351.0,
      "completions/mean_length": 237.5,
      "completions/mean_terminated_length": 237.5,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.3383919596672058,
      "epoch": 0.016072255673923112,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.059448838233947754,
      "kl": 0.0011732338170986623,
      "learning_rate": 9.967948124131542e-07,
      "loss": -0.0441,
      "num_tokens": 9605194.0,
      "reward": 0.3904230296611786,
      "reward_std": 0.3123384416103363,
      "rewards/reward_func/mean": 0.3904230296611786,
      "rewards/reward_func/std": 0.3123384416103363,
      "step": 347,
      "step_time": 32.66446267068386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 148.5625,
      "completions/mean_terminated_length": 148.5625,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3652502968907356,
      "epoch": 0.016118573413617415,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001007839571684599,
      "kl": 0.001655446772929281,
      "learning_rate": 9.967855488652154e-07,
      "loss": 0.0001,
      "num_tokens": 9628371.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 348,
      "step_time": 16.39089348167181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 161.5625,
      "completions/mean_terminated_length": 161.5625,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.3960127979516983,
      "epoch": 0.016164891153311718,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006725586717948318,
      "kl": 0.001324401848251,
      "learning_rate": 9.967762853172765e-07,
      "loss": 0.0001,
      "num_tokens": 9653612.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 349,
      "step_time": 17.55132332444191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 172.0625,
      "completions/mean_terminated_length": 172.0625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.20959357172250748,
      "epoch": 0.01621120889300602,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08087478578090668,
      "kl": 0.0009161113994196057,
      "learning_rate": 9.967670217693376e-07,
      "loss": 0.0043,
      "num_tokens": 9683773.0,
      "reward": 0.34488698840141296,
      "reward_std": 0.09196986258029938,
      "rewards/reward_func/mean": 0.34488698840141296,
      "rewards/reward_func/std": 0.09196987003087997,
      "step": 350,
      "step_time": 19.12022588402033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 188.9375,
      "completions/mean_terminated_length": 188.9375,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.3701925128698349,
      "epoch": 0.016257526632700324,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000581101980060339,
      "kl": 0.0014219658623915166,
      "learning_rate": 9.967577582213987e-07,
      "loss": 0.0001,
      "num_tokens": 9711388.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 351,
      "step_time": 20.990219868719578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 300.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 200.4375,
      "completions/mean_terminated_length": 200.4375,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.36787600815296173,
      "epoch": 0.016303844372394627,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07708732038736343,
      "kl": 0.0013787887146463618,
      "learning_rate": 9.967484946734599e-07,
      "loss": -0.1613,
      "num_tokens": 9733347.0,
      "reward": 0.21756526827812195,
      "reward_std": 0.39111003279685974,
      "rewards/reward_func/mean": 0.21756526827812195,
      "rewards/reward_func/std": 0.39111006259918213,
      "step": 352,
      "step_time": 25.04696473479271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 192.8125,
      "completions/mean_terminated_length": 192.8125,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.18251312896609306,
      "epoch": 0.01635016211208893,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06357643008232117,
      "kl": 0.001038837610394694,
      "learning_rate": 9.96739231125521e-07,
      "loss": -0.0092,
      "num_tokens": 9771056.0,
      "reward": 0.7122365236282349,
      "reward_std": 0.23176871240139008,
      "rewards/reward_func/mean": 0.7122365236282349,
      "rewards/reward_func/std": 0.23176871240139008,
      "step": 353,
      "step_time": 22.896654035896063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 233.625,
      "completions/mean_terminated_length": 233.625,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "entropy": 0.16217869520187378,
      "epoch": 0.016396479851783233,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00035185046726837754,
      "kl": 0.0006140474142739549,
      "learning_rate": 9.967299675775821e-07,
      "loss": 0.0,
      "num_tokens": 9797290.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 354,
      "step_time": 22.49565551057458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 171.0625,
      "completions/mean_terminated_length": 171.0625,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.39254141598939896,
      "epoch": 0.016442797591477536,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011137102264910936,
      "kl": 0.00146244463394396,
      "learning_rate": 9.967207040296432e-07,
      "loss": 0.0001,
      "num_tokens": 9845755.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 355,
      "step_time": 24.57612419500947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 128.5,
      "completions/mean_terminated_length": 128.5,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.19935456290841103,
      "epoch": 0.01648911533117184,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000986173516139388,
      "kl": 0.0009092786349356174,
      "learning_rate": 9.967114404817044e-07,
      "loss": 0.0,
      "num_tokens": 9865203.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 356,
      "step_time": 13.655712105333805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 118.875,
      "completions/mean_terminated_length": 118.875,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.21422869712114334,
      "epoch": 0.01653543307086614,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006323560373857617,
      "kl": 0.0007846915978007019,
      "learning_rate": 9.967021769337655e-07,
      "loss": 0.0,
      "num_tokens": 9886929.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 357,
      "step_time": 14.202369064092636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 170.0,
      "completions/mean_terminated_length": 170.0,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.17644475027918816,
      "epoch": 0.016581750810560444,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07472806423902512,
      "kl": 0.0007407554658129811,
      "learning_rate": 9.966929133858266e-07,
      "loss": -0.0007,
      "num_tokens": 9909873.0,
      "reward": 0.9164585471153259,
      "reward_std": 0.03093360736966133,
      "rewards/reward_func/mean": 0.9164585471153259,
      "rewards/reward_func/std": 0.03093361109495163,
      "step": 358,
      "step_time": 17.908420998603106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 135.0625,
      "completions/mean_terminated_length": 135.0625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.36569082736968994,
      "epoch": 0.016628068550254747,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007995835039764643,
      "kl": 0.0013750218786299229,
      "learning_rate": 9.96683649837888e-07,
      "loss": 0.0001,
      "num_tokens": 9945794.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 359,
      "step_time": 18.089062709361315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 121.0,
      "completions/max_terminated_length": 121.0,
      "completions/mean_length": 110.8125,
      "completions/mean_terminated_length": 110.8125,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.26153818517923355,
      "epoch": 0.01667438628994905,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011588912457227707,
      "kl": 0.0012160380429122597,
      "learning_rate": 9.96674386289949e-07,
      "loss": 0.0001,
      "num_tokens": 9965647.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 360,
      "step_time": 12.353236109018326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 267.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 196.0,
      "completions/mean_terminated_length": 196.0,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.2633475586771965,
      "epoch": 0.016720704029643353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07660730928182602,
      "kl": 0.0011788938863901421,
      "learning_rate": 9.9666512274201e-07,
      "loss": 0.0141,
      "num_tokens": 9995535.0,
      "reward": 0.9712770581245422,
      "reward_std": 0.11489176005125046,
      "rewards/reward_func/mean": 0.9712770581245422,
      "rewards/reward_func/std": 0.11489175260066986,
      "step": 361,
      "step_time": 24.44135208800435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 120.75,
      "completions/mean_terminated_length": 120.75,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2666449323296547,
      "epoch": 0.016767021769337656,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006468050414696336,
      "kl": 0.0010750905494205654,
      "learning_rate": 9.966558591940713e-07,
      "loss": 0.0001,
      "num_tokens": 10016587.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 362,
      "step_time": 14.386831555515528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 152.0,
      "completions/mean_terminated_length": 152.0,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.32977041602134705,
      "epoch": 0.01681333950903196,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009706729324534535,
      "kl": 0.0012442492734408006,
      "learning_rate": 9.966465956461325e-07,
      "loss": 0.0001,
      "num_tokens": 10043947.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 363,
      "step_time": 17.256726995110512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 139.5625,
      "completions/mean_terminated_length": 139.5625,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.3330072611570358,
      "epoch": 0.016859657248726262,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009713650797493756,
      "kl": 0.0013287764450069517,
      "learning_rate": 9.966373320981936e-07,
      "loss": 0.0001,
      "num_tokens": 10079988.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 364,
      "step_time": 18.95854353159666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3273973688483238,
      "epoch": 0.016905974988420565,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006421417347155511,
      "kl": 0.0012357144441921264,
      "learning_rate": 9.966280685502547e-07,
      "loss": 0.0001,
      "num_tokens": 10100770.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 365,
      "step_time": 17.175654880702496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 288.0,
      "completions/max_terminated_length": 288.0,
      "completions/mean_length": 202.8125,
      "completions/mean_terminated_length": 202.8125,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.30211323499679565,
      "epoch": 0.016952292728114868,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007135492283850908,
      "kl": 0.008094704942777753,
      "learning_rate": 9.966188050023158e-07,
      "loss": 0.0004,
      "num_tokens": 10126799.0,
      "reward": 0.8668779134750366,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8668779134750366,
      "rewards/reward_func/std": 0.0,
      "step": 366,
      "step_time": 24.911034680902958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 157.8125,
      "completions/mean_terminated_length": 157.8125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3083705008029938,
      "epoch": 0.01699861046780917,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006156533490866423,
      "kl": 0.0010671147028915584,
      "learning_rate": 9.96609541454377e-07,
      "loss": 0.0001,
      "num_tokens": 10152428.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 367,
      "step_time": 17.69282505661249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 184.9375,
      "completions/mean_terminated_length": 184.9375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3511301353573799,
      "epoch": 0.017044928207503474,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08407903462648392,
      "kl": 0.0012649931013584137,
      "learning_rate": 9.96600277906438e-07,
      "loss": -0.007,
      "num_tokens": 10175419.0,
      "reward": 0.4160774350166321,
      "reward_std": 0.4874512851238251,
      "rewards/reward_func/mean": 0.4160774350166321,
      "rewards/reward_func/std": 0.48745131492614746,
      "step": 368,
      "step_time": 19.776455257087946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 125.6875,
      "completions/mean_terminated_length": 125.6875,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.2581064775586128,
      "epoch": 0.017091245947197777,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013785755727440119,
      "kl": 0.00167951884213835,
      "learning_rate": 9.965910143584992e-07,
      "loss": 0.0001,
      "num_tokens": 10195782.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 369,
      "step_time": 13.841001875698566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 181.3125,
      "completions/mean_terminated_length": 181.3125,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.22950803488492966,
      "epoch": 0.01713756368689208,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007128478027880192,
      "kl": 0.0009088489605346695,
      "learning_rate": 9.965817508105603e-07,
      "loss": 0.0,
      "num_tokens": 10221691.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 370,
      "step_time": 19.305286843329668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 270.0,
      "completions/max_terminated_length": 270.0,
      "completions/mean_length": 214.0,
      "completions/mean_terminated_length": 214.0,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.31679457426071167,
      "epoch": 0.017183881426586382,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.048416707664728165,
      "kl": 0.0011358647898305207,
      "learning_rate": 9.965724872626215e-07,
      "loss": -0.065,
      "num_tokens": 10251579.0,
      "reward": 0.9204668998718262,
      "reward_std": 0.2546977400779724,
      "rewards/reward_func/mean": 0.9204668998718262,
      "rewards/reward_func/std": 0.2546977400779724,
      "step": 371,
      "step_time": 24.533843584358692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 184.0625,
      "completions/mean_terminated_length": 184.0625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.1564195305109024,
      "epoch": 0.017230199166280685,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004370823735371232,
      "kl": 0.0006024371250532568,
      "learning_rate": 9.965632237146828e-07,
      "loss": 0.0,
      "num_tokens": 10286716.0,
      "reward": 0.8385766744613647,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8385766744613647,
      "rewards/reward_func/std": 0.0,
      "step": 372,
      "step_time": 21.68499232083559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 113.9375,
      "completions/mean_terminated_length": 113.9375,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.282099224627018,
      "epoch": 0.017276516905974988,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009552930714562535,
      "kl": 0.0012493654940044507,
      "learning_rate": 9.96553960166744e-07,
      "loss": 0.0001,
      "num_tokens": 10307515.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 373,
      "step_time": 15.568065013736486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 183.75,
      "completions/mean_terminated_length": 183.75,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.4235863834619522,
      "epoch": 0.01732283464566929,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008119754493236542,
      "kl": 0.0013793996185995638,
      "learning_rate": 9.965446966188048e-07,
      "loss": 0.0001,
      "num_tokens": 10337463.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 374,
      "step_time": 21.269214287400246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 125.375,
      "completions/mean_terminated_length": 125.375,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2954377830028534,
      "epoch": 0.017369152385363594,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00681885052472353,
      "kl": 0.003083104733377695,
      "learning_rate": 9.965354330708662e-07,
      "loss": 0.0002,
      "num_tokens": 10373789.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 375,
      "step_time": 17.8924512937665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 208.1875,
      "completions/mean_terminated_length": 208.1875,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.22619051113724709,
      "epoch": 0.017415470125057897,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009632823057472706,
      "kl": 0.001133952711825259,
      "learning_rate": 9.965261695229273e-07,
      "loss": 0.0001,
      "num_tokens": 10404960.0,
      "reward": 0.894839346408844,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.894839346408844,
      "rewards/reward_func/std": 0.0,
      "step": 376,
      "step_time": 22.028299398720264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 169.75,
      "completions/mean_terminated_length": 169.75,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.37804020941257477,
      "epoch": 0.0174617878647522,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007527231355197728,
      "kl": 0.0013605688291136175,
      "learning_rate": 9.965169059749884e-07,
      "loss": 0.0001,
      "num_tokens": 10432332.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 377,
      "step_time": 18.87598704174161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 152.5625,
      "completions/mean_terminated_length": 152.5625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.42359697818756104,
      "epoch": 0.017508105604446503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008097761310636997,
      "kl": 0.001501502498285845,
      "learning_rate": 9.965076424270495e-07,
      "loss": 0.0001,
      "num_tokens": 10483685.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 378,
      "step_time": 22.994311198592186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 134.0,
      "completions/mean_terminated_length": 134.0,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.31661297380924225,
      "epoch": 0.017554423344140806,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008577098487876356,
      "kl": 0.0014004443655721843,
      "learning_rate": 9.964983788791107e-07,
      "loss": 0.0001,
      "num_tokens": 10505093.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 379,
      "step_time": 15.779139500111341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 179.0625,
      "completions/mean_terminated_length": 179.0625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.387944720685482,
      "epoch": 0.01760074108383511,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007746884948574007,
      "kl": 0.0016006549121811986,
      "learning_rate": 9.964891153311718e-07,
      "loss": 0.0001,
      "num_tokens": 10564198.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 380,
      "step_time": 29.4970294944942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 185.9375,
      "completions/mean_terminated_length": 185.9375,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.42155060917139053,
      "epoch": 0.01764705882352941,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006630704738199711,
      "kl": 0.0011422887037042528,
      "learning_rate": 9.96479851783233e-07,
      "loss": 0.0001,
      "num_tokens": 10590917.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 381,
      "step_time": 19.99205644056201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 183.375,
      "completions/mean_terminated_length": 183.375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.33681362122297287,
      "epoch": 0.017693376563223714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008519992115907371,
      "kl": 0.001492333714850247,
      "learning_rate": 9.96470588235294e-07,
      "loss": 0.0001,
      "num_tokens": 10618075.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 382,
      "step_time": 20.17256711423397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 154.9375,
      "completions/mean_terminated_length": 154.9375,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.322447806596756,
      "epoch": 0.017739694302918017,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006841674912720919,
      "kl": 0.0012063782196491957,
      "learning_rate": 9.964613246873552e-07,
      "loss": 0.0001,
      "num_tokens": 10639658.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 383,
      "step_time": 16.244937404990196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 190.0625,
      "completions/mean_terminated_length": 190.0625,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.1852053627371788,
      "epoch": 0.01778601204261232,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005796203622594476,
      "kl": 0.0007577495707664639,
      "learning_rate": 9.964520611394163e-07,
      "loss": 0.0,
      "num_tokens": 10678235.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 384,
      "step_time": 26.06255165860057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 199.25,
      "completions/mean_terminated_length": 199.25,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.3715161606669426,
      "epoch": 0.017832329782306623,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007285014144144952,
      "kl": 0.0013633314811158925,
      "learning_rate": 9.964427975914776e-07,
      "loss": 0.0001,
      "num_tokens": 10699711.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 385,
      "step_time": 19.731825433671474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 210.0,
      "completions/mean_terminated_length": 210.0,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.26311708986759186,
      "epoch": 0.017878647522000926,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07385022193193436,
      "kl": 0.001333654799964279,
      "learning_rate": 9.964335340435385e-07,
      "loss": -0.0118,
      "num_tokens": 10734239.0,
      "reward": 0.9915130138397217,
      "reward_std": 0.015181982889771461,
      "rewards/reward_func/mean": 0.9915130138397217,
      "rewards/reward_func/std": 0.015181982889771461,
      "step": 386,
      "step_time": 23.218905702233315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 180.75,
      "completions/mean_terminated_length": 180.75,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.24060431122779846,
      "epoch": 0.01792496526169523,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12353584170341492,
      "kl": 0.0016247373714577407,
      "learning_rate": 9.964242704955997e-07,
      "loss": -0.0333,
      "num_tokens": 10759387.0,
      "reward": 0.13478919863700867,
      "reward_std": 0.11306636780500412,
      "rewards/reward_func/mean": 0.13478919863700867,
      "rewards/reward_func/std": 0.11306636780500412,
      "step": 387,
      "step_time": 20.02836049720645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 135.0,
      "completions/mean_terminated_length": 135.0,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.29737626016139984,
      "epoch": 0.017971283001389532,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006051292875781655,
      "kl": 0.0011766636307584122,
      "learning_rate": 9.964150069476608e-07,
      "loss": 0.0001,
      "num_tokens": 10780139.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 388,
      "step_time": 13.97381442412734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 130.0,
      "completions/mean_terminated_length": 130.0,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.24731147661805153,
      "epoch": 0.018017600741083835,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00118044123519212,
      "kl": 0.0010196207003900781,
      "learning_rate": 9.964057433997221e-07,
      "loss": 0.0001,
      "num_tokens": 10799963.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 389,
      "step_time": 13.613224472850561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 161.5,
      "completions/mean_terminated_length": 161.5,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.4084602892398834,
      "epoch": 0.018063918480778138,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006236043991521001,
      "kl": 0.0011252271651756018,
      "learning_rate": 9.963964798517833e-07,
      "loss": 0.0001,
      "num_tokens": 10833107.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 390,
      "step_time": 19.000489212572575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 133.3125,
      "completions/mean_terminated_length": 133.3125,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2630554661154747,
      "epoch": 0.01811023622047244,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000608460686635226,
      "kl": 0.0009470109653193504,
      "learning_rate": 9.963872163038444e-07,
      "loss": 0.0,
      "num_tokens": 10859032.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 391,
      "step_time": 16.39188402891159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 263.0,
      "completions/max_terminated_length": 263.0,
      "completions/mean_length": 181.9375,
      "completions/mean_terminated_length": 181.9375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3491543009877205,
      "epoch": 0.018156553960166744,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0723331868648529,
      "kl": 0.0010232469794573262,
      "learning_rate": 9.963779527559055e-07,
      "loss": 0.0248,
      "num_tokens": 10879879.0,
      "reward": 0.6944708228111267,
      "reward_std": 0.4141024053096771,
      "rewards/reward_func/mean": 0.6944708228111267,
      "rewards/reward_func/std": 0.4141024053096771,
      "step": 392,
      "step_time": 21.867709532380104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 118.4375,
      "completions/mean_terminated_length": 118.4375,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.27682629972696304,
      "epoch": 0.018202871699861047,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010108175920322537,
      "kl": 0.0014717382146045566,
      "learning_rate": 9.963686892079666e-07,
      "loss": 0.0001,
      "num_tokens": 10900654.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 393,
      "step_time": 13.679325930774212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 174.875,
      "completions/mean_terminated_length": 174.875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.42017994076013565,
      "epoch": 0.01824918943955535,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009347493760287762,
      "kl": 0.0015324850683100522,
      "learning_rate": 9.963594256600278e-07,
      "loss": 0.0001,
      "num_tokens": 10924332.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 394,
      "step_time": 19.324608132243156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 121.875,
      "completions/mean_terminated_length": 121.875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.23197819292545319,
      "epoch": 0.018295507179249652,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008783585508354008,
      "kl": 0.0011164712195750326,
      "learning_rate": 9.963501621120889e-07,
      "loss": 0.0001,
      "num_tokens": 10943642.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 395,
      "step_time": 14.114130672067404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 194.5625,
      "completions/mean_terminated_length": 194.5625,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.241884246468544,
      "epoch": 0.018341824918943955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007886892999522388,
      "kl": 0.0010119648650288582,
      "learning_rate": 9.9634089856415e-07,
      "loss": 0.0001,
      "num_tokens": 10967651.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 396,
      "step_time": 20.774743512272835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 182.375,
      "completions/mean_terminated_length": 182.375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.1793980784714222,
      "epoch": 0.018388142658638258,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007735713152214885,
      "kl": 0.0008657000289531425,
      "learning_rate": 9.963316350162111e-07,
      "loss": 0.0,
      "num_tokens": 10991209.0,
      "reward": 0.9574533700942993,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9574533700942993,
      "rewards/reward_func/std": 0.0,
      "step": 397,
      "step_time": 19.160449791699648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 165.125,
      "completions/mean_terminated_length": 165.125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.4707822874188423,
      "epoch": 0.01843446039833256,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004372514085844159,
      "kl": 0.0012005920871160924,
      "learning_rate": 9.963223714682723e-07,
      "loss": 0.0001,
      "num_tokens": 11026235.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 398,
      "step_time": 20.6530371196568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 114.6875,
      "completions/mean_terminated_length": 114.6875,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2661431133747101,
      "epoch": 0.018480778138026864,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000946751213632524,
      "kl": 0.0011715970758814365,
      "learning_rate": 9.963131079203334e-07,
      "loss": 0.0001,
      "num_tokens": 11046150.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 399,
      "step_time": 13.03931139409542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 141.875,
      "completions/mean_terminated_length": 141.875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.3072554022073746,
      "epoch": 0.018527095877721167,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.043054718524217606,
      "kl": 0.007840050500817597,
      "learning_rate": 9.963038443723945e-07,
      "loss": 0.0004,
      "num_tokens": 11067684.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 400,
      "step_time": 16.174638397991657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 176.25,
      "completions/mean_terminated_length": 176.25,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.24510730803012848,
      "epoch": 0.01857341361741547,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007515113684348762,
      "kl": 0.0011971300991717726,
      "learning_rate": 9.962945808244556e-07,
      "loss": 0.0001,
      "num_tokens": 11099912.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 401,
      "step_time": 19.840820968151093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 193.0625,
      "completions/mean_terminated_length": 193.0625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.18754839524626732,
      "epoch": 0.018619731357109773,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005197523278184235,
      "kl": 0.000750317718484439,
      "learning_rate": 9.96285317276517e-07,
      "loss": 0.0,
      "num_tokens": 11125481.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 402,
      "step_time": 21.889157086610794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 144.75,
      "completions/mean_terminated_length": 144.75,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.20500480011105537,
      "epoch": 0.018666049096804076,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009929519146680832,
      "kl": 0.0012351693003438413,
      "learning_rate": 9.96276053728578e-07,
      "loss": 0.0001,
      "num_tokens": 11146165.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 403,
      "step_time": 15.625893365591764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 124.25,
      "completions/mean_terminated_length": 124.25,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2573729380965233,
      "epoch": 0.01871236683649838,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011042029364034534,
      "kl": 0.001303912722505629,
      "learning_rate": 9.962667901806392e-07,
      "loss": 0.0001,
      "num_tokens": 11165529.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 404,
      "step_time": 13.343416448682547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 138.75,
      "completions/mean_terminated_length": 138.75,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.3041428029537201,
      "epoch": 0.01875868457619268,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005377588095143437,
      "kl": 0.0009079412993742153,
      "learning_rate": 9.962575266327003e-07,
      "loss": 0.0,
      "num_tokens": 11187141.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 405,
      "step_time": 14.533459562808275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 172.25,
      "completions/mean_terminated_length": 172.25,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.16587964445352554,
      "epoch": 0.018805002315886984,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001329808379523456,
      "kl": 0.0007990232334122993,
      "learning_rate": 9.962482630847615e-07,
      "loss": 0.0,
      "num_tokens": 11222169.0,
      "reward": 0.9091564416885376,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9091564416885376,
      "rewards/reward_func/std": 0.0,
      "step": 406,
      "step_time": 21.331826210021973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 134.375,
      "completions/mean_terminated_length": 134.375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3556923270225525,
      "epoch": 0.018851320055581287,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015332532348111272,
      "kl": 0.002596778911538422,
      "learning_rate": 9.962389995368226e-07,
      "loss": 0.0001,
      "num_tokens": 11279503.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 407,
      "step_time": 24.147029418498278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 166.625,
      "completions/mean_terminated_length": 166.625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3321279138326645,
      "epoch": 0.01889763779527559,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00116622110363096,
      "kl": 0.0012340332905296236,
      "learning_rate": 9.962297359888837e-07,
      "loss": 0.0001,
      "num_tokens": 11300841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 408,
      "step_time": 18.532040812075138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 197.5,
      "completions/mean_terminated_length": 197.5,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.2635272741317749,
      "epoch": 0.018943955534969893,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08912134915590286,
      "kl": 0.0022788423666497692,
      "learning_rate": 9.962204724409448e-07,
      "loss": -0.0253,
      "num_tokens": 11327393.0,
      "reward": 0.8944562077522278,
      "reward_std": 0.07349126785993576,
      "rewards/reward_func/mean": 0.8944562077522278,
      "rewards/reward_func/std": 0.07349127531051636,
      "step": 409,
      "step_time": 22.85629679635167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 183.6875,
      "completions/mean_terminated_length": 183.6875,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.19783100858330727,
      "epoch": 0.018990273274664196,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07475640624761581,
      "kl": 0.0014248900697566569,
      "learning_rate": 9.96211208893006e-07,
      "loss": 0.0045,
      "num_tokens": 11364684.0,
      "reward": 0.9977275133132935,
      "reward_std": 0.009089890867471695,
      "rewards/reward_func/mean": 0.9977275133132935,
      "rewards/reward_func/std": 0.009089887142181396,
      "step": 410,
      "step_time": 22.609322797507048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 238.5625,
      "completions/mean_terminated_length": 238.5625,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "entropy": 0.28156252205371857,
      "epoch": 0.0190365910143585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005377965862862766,
      "kl": 0.0008603379392297938,
      "learning_rate": 9.96201945345067e-07,
      "loss": 0.0,
      "num_tokens": 11387413.0,
      "reward": 0.9017226696014404,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9017226696014404,
      "rewards/reward_func/std": 0.0,
      "step": 411,
      "step_time": 23.784574691206217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 177.25,
      "completions/mean_terminated_length": 177.25,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.17503974959254265,
      "epoch": 0.019082908754052802,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06746372580528259,
      "kl": 0.0008583853050367907,
      "learning_rate": 9.961926817971282e-07,
      "loss": -0.0901,
      "num_tokens": 11419353.0,
      "reward": 0.8225798606872559,
      "reward_std": 0.23737215995788574,
      "rewards/reward_func/mean": 0.8225798606872559,
      "rewards/reward_func/std": 0.23737215995788574,
      "step": 412,
      "step_time": 20.757130481302738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 280.0,
      "completions/max_terminated_length": 280.0,
      "completions/mean_length": 231.1875,
      "completions/mean_terminated_length": 231.1875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.3314223960042,
      "epoch": 0.019129226493747105,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06072376295924187,
      "kl": 0.0010566960263531655,
      "learning_rate": 9.961834182491893e-07,
      "loss": -0.0666,
      "num_tokens": 11457596.0,
      "reward": 0.5625,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.5625,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 413,
      "step_time": 27.22598084807396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 124.6875,
      "completions/mean_terminated_length": 124.6875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.26997319608926773,
      "epoch": 0.019175544233441408,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008293652208521962,
      "kl": 0.0010149930603802204,
      "learning_rate": 9.961741547012505e-07,
      "loss": 0.0001,
      "num_tokens": 11480199.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 414,
      "step_time": 14.77862561494112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 173.0625,
      "completions/mean_terminated_length": 173.0625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.4134935438632965,
      "epoch": 0.01922186197313571,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008249669335782528,
      "kl": 0.001957462925929576,
      "learning_rate": 9.961648911533118e-07,
      "loss": 0.0001,
      "num_tokens": 11509320.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 415,
      "step_time": 20.258842054754496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 203.8125,
      "completions/mean_terminated_length": 203.8125,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.39648277312517166,
      "epoch": 0.019268179712830014,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002531531034037471,
      "kl": 0.001678121363511309,
      "learning_rate": 9.96155627605373e-07,
      "loss": 0.0001,
      "num_tokens": 11535669.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 416,
      "step_time": 21.772209532558918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 203.375,
      "completions/mean_terminated_length": 203.375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.45256636291742325,
      "epoch": 0.019314497452524317,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009294411283917725,
      "kl": 0.0014201795274857432,
      "learning_rate": 9.961463640574338e-07,
      "loss": 0.0001,
      "num_tokens": 11561291.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 417,
      "step_time": 22.37670413777232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 120.875,
      "completions/mean_terminated_length": 120.875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2436264269053936,
      "epoch": 0.01936081519221862,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012272412423044443,
      "kl": 0.001068125871825032,
      "learning_rate": 9.96137100509495e-07,
      "loss": 0.0001,
      "num_tokens": 11581721.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 418,
      "step_time": 13.968612048774958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 192.6875,
      "completions/mean_terminated_length": 192.6875,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.26581909507513046,
      "epoch": 0.019407132931912922,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004420572950039059,
      "kl": 0.0008505451696692035,
      "learning_rate": 9.961278369615563e-07,
      "loss": 0.0,
      "num_tokens": 11606228.0,
      "reward": 0.9146912097930908,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9146912097930908,
      "rewards/reward_func/std": 0.0,
      "step": 419,
      "step_time": 19.52475217729807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 267.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 177.25,
      "completions/mean_terminated_length": 177.25,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.23862052708864212,
      "epoch": 0.019453450671607225,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.089634008705616,
      "kl": 0.000901429884834215,
      "learning_rate": 9.961185734136174e-07,
      "loss": 0.0194,
      "num_tokens": 11627816.0,
      "reward": 0.8680884838104248,
      "reward_std": 0.23149026930332184,
      "rewards/reward_func/mean": 0.8680884838104248,
      "rewards/reward_func/std": 0.23149026930332184,
      "step": 420,
      "step_time": 22.91817284375429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 173.25,
      "completions/mean_terminated_length": 173.25,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.41315415501594543,
      "epoch": 0.019499768411301528,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009287703433074057,
      "kl": 0.0015140810573939234,
      "learning_rate": 9.961093098656785e-07,
      "loss": 0.0001,
      "num_tokens": 11649564.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 421,
      "step_time": 18.555513385683298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 163.5625,
      "completions/mean_terminated_length": 163.5625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3890605941414833,
      "epoch": 0.01954608615099583,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006287552532739937,
      "kl": 0.0012807418243028224,
      "learning_rate": 9.961000463177397e-07,
      "loss": 0.0001,
      "num_tokens": 11697301.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 422,
      "step_time": 22.50880254805088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 176.0,
      "completions/mean_terminated_length": 176.0,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.40944691747426987,
      "epoch": 0.019592403890690134,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01755422353744507,
      "kl": 0.004535476444289088,
      "learning_rate": 9.960907827698008e-07,
      "loss": 0.0002,
      "num_tokens": 11732965.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 423,
      "step_time": 21.59409398585558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 139.4375,
      "completions/mean_terminated_length": 139.4375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.31128521263599396,
      "epoch": 0.019638721630384437,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010005016811192036,
      "kl": 0.0014900580572430044,
      "learning_rate": 9.96081519221862e-07,
      "loss": 0.0001,
      "num_tokens": 11766268.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 424,
      "step_time": 18.541524816304445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 147.5,
      "completions/mean_terminated_length": 147.5,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.1634841077029705,
      "epoch": 0.01968503937007874,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008766019018366933,
      "kl": 0.0009638566698413342,
      "learning_rate": 9.96072255673923e-07,
      "loss": 0.0,
      "num_tokens": 11790020.0,
      "reward": 0.7958667874336243,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7958667874336243,
      "rewards/reward_func/std": 0.0,
      "step": 425,
      "step_time": 15.64836959168315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 118.375,
      "completions/mean_terminated_length": 118.375,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.27606356143951416,
      "epoch": 0.019731357109773043,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008227747166529298,
      "kl": 0.0011175721156178042,
      "learning_rate": 9.960629921259842e-07,
      "loss": 0.0001,
      "num_tokens": 11813402.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 426,
      "step_time": 14.213473957031965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 186.5,
      "completions/mean_terminated_length": 186.5,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.1671118028461933,
      "epoch": 0.019777674849467346,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.059474650770425797,
      "kl": 0.0008242270705522969,
      "learning_rate": 9.960537285780453e-07,
      "loss": 0.011,
      "num_tokens": 11847298.0,
      "reward": 0.9714365601539612,
      "reward_std": 0.029500193893909454,
      "rewards/reward_func/mean": 0.9714365601539612,
      "rewards/reward_func/std": 0.029500195756554604,
      "step": 427,
      "step_time": 20.827589195221663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 173.5625,
      "completions/mean_terminated_length": 173.5625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.2779013514518738,
      "epoch": 0.01982399258916165,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06270330399274826,
      "kl": 0.001159066567197442,
      "learning_rate": 9.960444650301064e-07,
      "loss": -0.0458,
      "num_tokens": 11870059.0,
      "reward": 0.4364950656890869,
      "reward_std": 0.04501661658287048,
      "rewards/reward_func/mean": 0.4364950656890869,
      "rewards/reward_func/std": 0.04501662030816078,
      "step": 428,
      "step_time": 19.117902901023626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 134.3125,
      "completions/mean_terminated_length": 134.3125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.324234202504158,
      "epoch": 0.01987031032885595,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010779553558677435,
      "kl": 0.0014260683383326977,
      "learning_rate": 9.960352014821675e-07,
      "loss": 0.0001,
      "num_tokens": 11898240.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 429,
      "step_time": 16.36806231737137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 184.4375,
      "completions/mean_terminated_length": 184.4375,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.21080321818590164,
      "epoch": 0.019916628068550254,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005658076261170208,
      "kl": 0.0007707481126999483,
      "learning_rate": 9.960259379342287e-07,
      "loss": 0.0,
      "num_tokens": 11939351.0,
      "reward": 0.8611735105514526,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8611735105514526,
      "rewards/reward_func/std": 0.0,
      "step": 430,
      "step_time": 24.574472688138485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 190.4375,
      "completions/mean_terminated_length": 190.4375,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.2915353327989578,
      "epoch": 0.019962945808244557,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05731703341007233,
      "kl": 0.0008475943322991952,
      "learning_rate": 9.960166743862898e-07,
      "loss": -0.0397,
      "num_tokens": 11960782.0,
      "reward": 0.5870636701583862,
      "reward_std": 0.47294896841049194,
      "rewards/reward_func/mean": 0.5870636701583862,
      "rewards/reward_func/std": 0.47294896841049194,
      "step": 431,
      "step_time": 19.433797158300877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 176.4375,
      "completions/mean_terminated_length": 176.4375,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.18618595600128174,
      "epoch": 0.02000926354793886,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00048418284859508276,
      "kl": 0.000596129655605182,
      "learning_rate": 9.960074108383511e-07,
      "loss": 0.0,
      "num_tokens": 12015493.0,
      "reward": 0.11362193524837494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11362193524837494,
      "rewards/reward_func/std": 0.0,
      "step": 432,
      "step_time": 26.345379684120417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 182.8125,
      "completions/mean_terminated_length": 182.8125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.35556047409772873,
      "epoch": 0.020055581287633163,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000724254350643605,
      "kl": 0.0011773772130254656,
      "learning_rate": 9.959981472904123e-07,
      "loss": 0.0001,
      "num_tokens": 12068578.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 433,
      "step_time": 27.155123606324196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 151.375,
      "completions/mean_terminated_length": 151.375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.1845146119594574,
      "epoch": 0.020101899027327466,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000514318875502795,
      "kl": 0.000833735044579953,
      "learning_rate": 9.959888837424734e-07,
      "loss": 0.0,
      "num_tokens": 12098552.0,
      "reward": 0.3916056156158447,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3916056156158447,
      "rewards/reward_func/std": 0.0,
      "step": 434,
      "step_time": 17.356296803802252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 125.0,
      "completions/max_terminated_length": 125.0,
      "completions/mean_length": 110.0,
      "completions/mean_terminated_length": 110.0,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.27629970014095306,
      "epoch": 0.02014821676702177,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009520486346445978,
      "kl": 0.0012683477252721786,
      "learning_rate": 9.959796201945345e-07,
      "loss": 0.0001,
      "num_tokens": 12118168.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 435,
      "step_time": 12.731515988707542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 181.9375,
      "completions/mean_terminated_length": 181.9375,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.1562896929681301,
      "epoch": 0.020194534506716072,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003385805175639689,
      "kl": 0.0005081234630779363,
      "learning_rate": 9.959703566465956e-07,
      "loss": 0.0,
      "num_tokens": 12151799.0,
      "reward": 0.9000876545906067,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9000876545906067,
      "rewards/reward_func/std": 0.0,
      "step": 436,
      "step_time": 20.731846310198307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 129.0,
      "completions/max_terminated_length": 129.0,
      "completions/mean_length": 111.5625,
      "completions/mean_terminated_length": 111.5625,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.27672744169831276,
      "epoch": 0.020240852246410375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011817258782684803,
      "kl": 0.0015236828476190567,
      "learning_rate": 9.959610930986568e-07,
      "loss": 0.0001,
      "num_tokens": 12173632.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 437,
      "step_time": 13.193745326250792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 162.0625,
      "completions/mean_terminated_length": 162.0625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.16578193753957748,
      "epoch": 0.020287169986104678,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10385134816169739,
      "kl": 0.0011733440915122628,
      "learning_rate": 9.959518295507179e-07,
      "loss": 0.0098,
      "num_tokens": 12218161.0,
      "reward": 0.8985783457756042,
      "reward_std": 0.05031924694776535,
      "rewards/reward_func/mean": 0.8985783457756042,
      "rewards/reward_func/std": 0.05031923949718475,
      "step": 438,
      "step_time": 22.68815726041794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 203.5,
      "completions/mean_terminated_length": 203.5,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.18357586860656738,
      "epoch": 0.02033348772579898,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.045642703771591187,
      "kl": 0.0006995180883677676,
      "learning_rate": 9.95942566002779e-07,
      "loss": -0.0237,
      "num_tokens": 12258713.0,
      "reward": 0.04365791007876396,
      "reward_std": 0.010465163737535477,
      "rewards/reward_func/mean": 0.04365791007876396,
      "rewards/reward_func/std": 0.010465164668858051,
      "step": 439,
      "step_time": 23.417887415736914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 175.875,
      "completions/mean_terminated_length": 175.875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3608877509832382,
      "epoch": 0.020379805465493284,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020068450830876827,
      "kl": 0.002231194492196664,
      "learning_rate": 9.959333024548401e-07,
      "loss": 0.0001,
      "num_tokens": 12289719.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 440,
      "step_time": 20.223703049123287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 139.1875,
      "completions/mean_terminated_length": 139.1875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.21962684765458107,
      "epoch": 0.020426123205187587,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006896913982927799,
      "kl": 0.0009931822860380635,
      "learning_rate": 9.959240389069013e-07,
      "loss": 0.0,
      "num_tokens": 12320538.0,
      "reward": 0.09207873791456223,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.09207873791456223,
      "rewards/reward_func/std": 0.0,
      "step": 441,
      "step_time": 16.85022407770157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 135.0625,
      "completions/mean_terminated_length": 135.0625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.32932811975479126,
      "epoch": 0.02047244094488189,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010856821900233626,
      "kl": 0.001221887971041724,
      "learning_rate": 9.959147753589624e-07,
      "loss": 0.0001,
      "num_tokens": 12343419.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 442,
      "step_time": 16.05213586986065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 128.0,
      "completions/mean_terminated_length": 128.0,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.29498112946748734,
      "epoch": 0.020518758684576192,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008743335492908955,
      "kl": 0.0013112361484672874,
      "learning_rate": 9.959055118110235e-07,
      "loss": 0.0001,
      "num_tokens": 12365243.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 443,
      "step_time": 15.129100944846869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 127.0625,
      "completions/mean_terminated_length": 127.0625,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.3172316774725914,
      "epoch": 0.020565076424270495,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007861247868277133,
      "kl": 0.001013173081446439,
      "learning_rate": 9.958962482630846e-07,
      "loss": 0.0001,
      "num_tokens": 12401052.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 444,
      "step_time": 18.39372304826975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 179.0625,
      "completions/mean_terminated_length": 179.0625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.2586454674601555,
      "epoch": 0.020611394163964798,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07765398174524307,
      "kl": 0.0010438812023494393,
      "learning_rate": 9.95886984715146e-07,
      "loss": -0.0045,
      "num_tokens": 12425165.0,
      "reward": 0.9533541202545166,
      "reward_std": 0.07145605236291885,
      "rewards/reward_func/mean": 0.9533541202545166,
      "rewards/reward_func/std": 0.07145605981349945,
      "step": 445,
      "step_time": 19.860888108611107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 151.3125,
      "completions/mean_terminated_length": 151.3125,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.27377403527498245,
      "epoch": 0.0206577119036591,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006911946693435311,
      "kl": 0.000904399566934444,
      "learning_rate": 9.95877721167207e-07,
      "loss": 0.0,
      "num_tokens": 12446450.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 446,
      "step_time": 17.764220606535673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 156.625,
      "completions/mean_terminated_length": 156.625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.4193510413169861,
      "epoch": 0.020704029643353404,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007914117304608226,
      "kl": 0.00129216568893753,
      "learning_rate": 9.958684576192682e-07,
      "loss": 0.0001,
      "num_tokens": 12470876.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 447,
      "step_time": 17.249667938798666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 206.0625,
      "completions/mean_terminated_length": 206.0625,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.21897048875689507,
      "epoch": 0.020750347383047707,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007031591376289725,
      "kl": 0.0008966055902419612,
      "learning_rate": 9.958591940713291e-07,
      "loss": 0.0,
      "num_tokens": 12508829.0,
      "reward": 0.7165313363075256,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7165313363075256,
      "rewards/reward_func/std": 0.0,
      "step": 448,
      "step_time": 23.306960482150316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 157.75,
      "completions/mean_terminated_length": 157.75,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.37646690756082535,
      "epoch": 0.02079666512274201,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007410895777866244,
      "kl": 0.0014292299165390432,
      "learning_rate": 9.958499305233905e-07,
      "loss": 0.0001,
      "num_tokens": 12538409.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 449,
      "step_time": 18.985281493514776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 149.0625,
      "completions/mean_terminated_length": 149.0625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.2667747214436531,
      "epoch": 0.020842982862436313,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006505842902697623,
      "kl": 0.0009782446722965688,
      "learning_rate": 9.958406669754516e-07,
      "loss": 0.0,
      "num_tokens": 12559786.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 450,
      "step_time": 15.077436048537493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 172.875,
      "completions/mean_terminated_length": 172.875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.39849378913640976,
      "epoch": 0.020889300602130616,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008802087977528572,
      "kl": 0.0013437908492051065,
      "learning_rate": 9.958314034275127e-07,
      "loss": 0.0001,
      "num_tokens": 12583704.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 451,
      "step_time": 18.90094792470336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 118.625,
      "completions/mean_terminated_length": 118.625,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.2805650979280472,
      "epoch": 0.02093561834182492,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007345624617300928,
      "kl": 0.001147609727922827,
      "learning_rate": 9.958221398795738e-07,
      "loss": 0.0001,
      "num_tokens": 12605346.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 452,
      "step_time": 13.880728926509619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 206.5625,
      "completions/mean_terminated_length": 206.5625,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.26133378595113754,
      "epoch": 0.02098193608151922,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008001459063962102,
      "kl": 0.0011924623395316303,
      "learning_rate": 9.95812876331635e-07,
      "loss": 0.0001,
      "num_tokens": 12642955.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 453,
      "step_time": 24.59642492234707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 332.0,
      "completions/max_terminated_length": 332.0,
      "completions/mean_length": 229.1875,
      "completions/mean_terminated_length": 229.1875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.37597081810235977,
      "epoch": 0.021028253821213524,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07688658684492111,
      "kl": 0.0026821557548828423,
      "learning_rate": 9.95803612783696e-07,
      "loss": -0.2149,
      "num_tokens": 12676414.0,
      "reward": 0.3576716184616089,
      "reward_std": 0.47980526089668274,
      "rewards/reward_func/mean": 0.3576716184616089,
      "rewards/reward_func/std": 0.47980526089668274,
      "step": 454,
      "step_time": 30.136327359825373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 182.5,
      "completions/mean_terminated_length": 182.5,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.39297641068696976,
      "epoch": 0.021074571560907827,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009784556459635496,
      "kl": 0.0014437072968576103,
      "learning_rate": 9.957943492357572e-07,
      "loss": 0.0001,
      "num_tokens": 12708710.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 455,
      "step_time": 22.129728976637125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 172.6875,
      "completions/mean_terminated_length": 172.6875,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.11051014810800552,
      "epoch": 0.02112088930060213,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00026434153551235795,
      "kl": 0.00039409926830558106,
      "learning_rate": 9.957850856878183e-07,
      "loss": 0.0,
      "num_tokens": 12743441.0,
      "reward": 0.5044883489608765,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5044883489608765,
      "rewards/reward_func/std": 0.0,
      "step": 456,
      "step_time": 19.395647291094065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 149.125,
      "completions/mean_terminated_length": 149.125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.4058089703321457,
      "epoch": 0.021167207040296433,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007130375597625971,
      "kl": 0.0013419112074188888,
      "learning_rate": 9.957758221398795e-07,
      "loss": 0.0001,
      "num_tokens": 12785155.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 457,
      "step_time": 20.202812299132347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 163.625,
      "completions/mean_terminated_length": 163.625,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.40577686578035355,
      "epoch": 0.021213524779990736,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009446601616218686,
      "kl": 0.0013670599437318742,
      "learning_rate": 9.957665585919406e-07,
      "loss": 0.0001,
      "num_tokens": 12811517.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 458,
      "step_time": 19.652188416570425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 161.875,
      "completions/mean_terminated_length": 161.875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.40780486911535263,
      "epoch": 0.02125984251968504,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006535582942888141,
      "kl": 0.0013698584225494415,
      "learning_rate": 9.95757295044002e-07,
      "loss": 0.0001,
      "num_tokens": 12844875.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 459,
      "step_time": 20.958265770226717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 127.8125,
      "completions/mean_terminated_length": 127.8125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2913421094417572,
      "epoch": 0.021306160259379342,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007478291518054903,
      "kl": 0.0011158910201629624,
      "learning_rate": 9.957480314960628e-07,
      "loss": 0.0001,
      "num_tokens": 12868040.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 460,
      "step_time": 16.351258099079132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 128.5625,
      "completions/mean_terminated_length": 128.5625,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2619505301117897,
      "epoch": 0.021352477999073645,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005713357822969556,
      "kl": 0.0010241223353659734,
      "learning_rate": 9.95738767948124e-07,
      "loss": 0.0001,
      "num_tokens": 12889377.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 461,
      "step_time": 13.749565534293652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 120.9375,
      "completions/mean_terminated_length": 120.9375,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.3522017151117325,
      "epoch": 0.021398795738767948,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001095265382900834,
      "kl": 0.0015934919065330178,
      "learning_rate": 9.957295044001853e-07,
      "loss": 0.0001,
      "num_tokens": 12911456.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 462,
      "step_time": 14.89537600800395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 198.5625,
      "completions/mean_terminated_length": 198.5625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.39398887008428574,
      "epoch": 0.02144511347846225,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012047621421515942,
      "kl": 0.0014141463616397232,
      "learning_rate": 9.957202408522464e-07,
      "loss": 0.0001,
      "num_tokens": 12940873.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 463,
      "step_time": 22.350401777774096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 146.3125,
      "completions/mean_terminated_length": 146.3125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3431501239538193,
      "epoch": 0.021491431218156554,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0072228494100272655,
      "kl": 0.004282706417143345,
      "learning_rate": 9.957109773043076e-07,
      "loss": 0.0002,
      "num_tokens": 12969886.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 464,
      "step_time": 17.743618417531252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 144.125,
      "completions/mean_terminated_length": 144.125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.16300039738416672,
      "epoch": 0.021537748957850857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005739906919188797,
      "kl": 0.0008377209451282397,
      "learning_rate": 9.957017137563687e-07,
      "loss": 0.0,
      "num_tokens": 12991456.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 465,
      "step_time": 16.602536369115114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 290.0,
      "completions/max_terminated_length": 290.0,
      "completions/mean_length": 207.0,
      "completions/mean_terminated_length": 207.0,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.3950920030474663,
      "epoch": 0.02158406669754516,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07530498504638672,
      "kl": 0.001105832023313269,
      "learning_rate": 9.956924502084298e-07,
      "loss": -0.0647,
      "num_tokens": 13014336.0,
      "reward": 0.21374839544296265,
      "reward_std": 0.2851552963256836,
      "rewards/reward_func/mean": 0.21374839544296265,
      "rewards/reward_func/std": 0.2851552963256836,
      "step": 466,
      "step_time": 24.43825474753976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 182.125,
      "completions/mean_terminated_length": 182.125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.32652004808187485,
      "epoch": 0.021630384437239462,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004810943210031837,
      "kl": 0.0010445124644320458,
      "learning_rate": 9.95683186660491e-07,
      "loss": 0.0001,
      "num_tokens": 13039746.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 467,
      "step_time": 18.553403332829475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 152.6875,
      "completions/mean_terminated_length": 152.6875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.2733049914240837,
      "epoch": 0.021676702176933765,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011022585676982999,
      "kl": 0.0011483504786156118,
      "learning_rate": 9.95673923112552e-07,
      "loss": 0.0001,
      "num_tokens": 13066621.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 468,
      "step_time": 16.856421019881964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 341.0,
      "completions/max_terminated_length": 341.0,
      "completions/mean_length": 249.375,
      "completions/mean_terminated_length": 249.375,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "entropy": 0.2896248959004879,
      "epoch": 0.021723019916628068,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05923737585544586,
      "kl": 0.0010491950379218906,
      "learning_rate": 9.956646595646132e-07,
      "loss": -0.0386,
      "num_tokens": 13096899.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 469,
      "step_time": 29.257147755473852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 123.0625,
      "completions/mean_terminated_length": 123.0625,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.287365585565567,
      "epoch": 0.02176933765632237,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010540344519540668,
      "kl": 0.0010919710621237755,
      "learning_rate": 9.956553960166743e-07,
      "loss": 0.0001,
      "num_tokens": 13116612.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 470,
      "step_time": 14.792682588100433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 130.0625,
      "completions/mean_terminated_length": 130.0625,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.27139056473970413,
      "epoch": 0.021815655396016674,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026716021820902824,
      "kl": 0.001344893971690908,
      "learning_rate": 9.956461324687354e-07,
      "loss": 0.0001,
      "num_tokens": 13140085.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 471,
      "step_time": 15.204796615988016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 231.8125,
      "completions/mean_terminated_length": 231.8125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.30421893298625946,
      "epoch": 0.021861973135710977,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06558000296354294,
      "kl": 0.000971016037510708,
      "learning_rate": 9.956368689207966e-07,
      "loss": -0.0746,
      "num_tokens": 13171570.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 472,
      "step_time": 25.866084907203913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 389.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 283.3125,
      "completions/mean_terminated_length": 283.3125,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "entropy": 0.471021831035614,
      "epoch": 0.02190829087540528,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05412129685282707,
      "kl": 0.0016661942936480045,
      "learning_rate": 9.956276053728577e-07,
      "loss": 0.0941,
      "num_tokens": 13200359.0,
      "reward": 0.625,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5,
      "step": 473,
      "step_time": 33.09069042280316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 129.6875,
      "completions/mean_terminated_length": 129.6875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.24906113743782043,
      "epoch": 0.021954608615099583,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006597902975045145,
      "kl": 0.0008146143518388271,
      "learning_rate": 9.956183418249188e-07,
      "loss": 0.0,
      "num_tokens": 13225858.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 474,
      "step_time": 15.069956101477146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 115.75,
      "completions/mean_terminated_length": 115.75,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.24458341300487518,
      "epoch": 0.022000926354793886,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016917382599785924,
      "kl": 0.0013197254738770425,
      "learning_rate": 9.956090782769801e-07,
      "loss": 0.0001,
      "num_tokens": 13245422.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 475,
      "step_time": 13.75100864097476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 184.125,
      "completions/mean_terminated_length": 184.125,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.2562222480773926,
      "epoch": 0.02204724409448819,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09810595959424973,
      "kl": 0.0012838072143495083,
      "learning_rate": 9.955998147290413e-07,
      "loss": -0.0063,
      "num_tokens": 13277136.0,
      "reward": 0.5895459055900574,
      "reward_std": 0.0756286084651947,
      "rewards/reward_func/mean": 0.5895459055900574,
      "rewards/reward_func/std": 0.0756286159157753,
      "step": 476,
      "step_time": 20.08931627869606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 157.4375,
      "completions/mean_terminated_length": 157.4375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.17066820710897446,
      "epoch": 0.02209356183418249,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06698978692293167,
      "kl": 0.0012453247618395835,
      "learning_rate": 9.955905511811024e-07,
      "loss": -0.0196,
      "num_tokens": 13298839.0,
      "reward": 0.8975909948348999,
      "reward_std": 0.027001073583960533,
      "rewards/reward_func/mean": 0.8975909948348999,
      "rewards/reward_func/std": 0.027001069858670235,
      "step": 477,
      "step_time": 15.904055442661047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 367.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 244.75,
      "completions/mean_terminated_length": 244.75,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.37555670738220215,
      "epoch": 0.022139879573876795,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.058952413499355316,
      "kl": 0.0011130543425679207,
      "learning_rate": 9.955812876331633e-07,
      "loss": -0.2187,
      "num_tokens": 13326419.0,
      "reward": 0.35912901163101196,
      "reward_std": 0.46907129883766174,
      "rewards/reward_func/mean": 0.35912901163101196,
      "rewards/reward_func/std": 0.46907132863998413,
      "step": 478,
      "step_time": 30.928753718733788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 116.125,
      "completions/mean_terminated_length": 116.125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.3450111076235771,
      "epoch": 0.022186197313571097,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016331137157976627,
      "kl": 0.001576541137183085,
      "learning_rate": 9.955720240852246e-07,
      "loss": 0.0001,
      "num_tokens": 13362325.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 479,
      "step_time": 16.370044983923435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 304.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 233.4375,
      "completions/mean_terminated_length": 233.4375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.25906096026301384,
      "epoch": 0.0222325150532654,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07396862655878067,
      "kl": 0.0010420548933325335,
      "learning_rate": 9.955627605372858e-07,
      "loss": -0.1119,
      "num_tokens": 13402044.0,
      "reward": 0.1792462170124054,
      "reward_std": 0.33070844411849976,
      "rewards/reward_func/mean": 0.1792462170124054,
      "rewards/reward_func/std": 0.33070847392082214,
      "step": 480,
      "step_time": 29.113693229854107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 139.8125,
      "completions/mean_terminated_length": 139.8125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.34055541455745697,
      "epoch": 0.022278832792959703,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013534241588786244,
      "kl": 0.001348479330772534,
      "learning_rate": 9.955534969893469e-07,
      "loss": 0.0001,
      "num_tokens": 13437481.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 481,
      "step_time": 19.76912584528327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 177.25,
      "completions/mean_terminated_length": 177.25,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.42396819591522217,
      "epoch": 0.022325150532654006,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005905788857489824,
      "kl": 0.0013149116421118379,
      "learning_rate": 9.95544233441408e-07,
      "loss": 0.0001,
      "num_tokens": 13459581.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 482,
      "step_time": 19.408626589924097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 182.625,
      "completions/mean_terminated_length": 182.625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.4352477863430977,
      "epoch": 0.02237146827234831,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007822702755220234,
      "kl": 0.001167302776593715,
      "learning_rate": 9.955349698934691e-07,
      "loss": 0.0001,
      "num_tokens": 13482039.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 483,
      "step_time": 22.192662086337805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 184.75,
      "completions/mean_terminated_length": 184.75,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.2297377660870552,
      "epoch": 0.022417786012042612,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005211597308516502,
      "kl": 0.0008147654589265585,
      "learning_rate": 9.955257063455303e-07,
      "loss": 0.0,
      "num_tokens": 13510611.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 484,
      "step_time": 20.270441822707653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 176.25,
      "completions/mean_terminated_length": 176.25,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.3842536062002182,
      "epoch": 0.022464103751736915,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008047992596402764,
      "kl": 0.0013092006847728044,
      "learning_rate": 9.955164427975914e-07,
      "loss": 0.0001,
      "num_tokens": 13536119.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 485,
      "step_time": 20.586454547941685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 133.3125,
      "completions/mean_terminated_length": 133.3125,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.27248460054397583,
      "epoch": 0.022510421491431218,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005648453952744603,
      "kl": 0.0011865806591231376,
      "learning_rate": 9.955071792496525e-07,
      "loss": 0.0001,
      "num_tokens": 13555804.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 486,
      "step_time": 14.476582117378712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 125.1875,
      "completions/mean_terminated_length": 125.1875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.27011676877737045,
      "epoch": 0.02255673923112552,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001830546883866191,
      "kl": 0.0018114371341653168,
      "learning_rate": 9.954979157017136e-07,
      "loss": 0.0001,
      "num_tokens": 13575711.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 487,
      "step_time": 13.421096365898848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 197.75,
      "completions/mean_terminated_length": 197.75,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.37179628759622574,
      "epoch": 0.022603056970819824,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001344718155451119,
      "kl": 0.001363539631711319,
      "learning_rate": 9.954886521537748e-07,
      "loss": 0.0001,
      "num_tokens": 13612651.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 488,
      "step_time": 23.876486036926508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 126.9375,
      "completions/mean_terminated_length": 126.9375,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.3013797104358673,
      "epoch": 0.022649374710514127,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032518936786800623,
      "kl": 0.0018434434314258397,
      "learning_rate": 9.95479388605836e-07,
      "loss": 0.0001,
      "num_tokens": 13632490.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 489,
      "step_time": 15.357919406145811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 149.6875,
      "completions/mean_terminated_length": 149.6875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.32749275863170624,
      "epoch": 0.02269569245020843,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018983208574354649,
      "kl": 0.0014090609329286963,
      "learning_rate": 9.954701250578972e-07,
      "loss": 0.0001,
      "num_tokens": 13673333.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 490,
      "step_time": 21.186318166553974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 132.1875,
      "completions/mean_terminated_length": 132.1875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2657178193330765,
      "epoch": 0.022742010189902732,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010834588902071118,
      "kl": 0.001249116670805961,
      "learning_rate": 9.954608615099581e-07,
      "loss": 0.0001,
      "num_tokens": 13694792.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 491,
      "step_time": 14.536582689732313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 146.875,
      "completions/mean_terminated_length": 146.875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.33631060272455215,
      "epoch": 0.022788327929597035,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001670774887315929,
      "kl": 0.0016863863274920732,
      "learning_rate": 9.954515979620195e-07,
      "loss": 0.0001,
      "num_tokens": 13724054.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 492,
      "step_time": 17.81245766952634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 319.0,
      "completions/max_terminated_length": 319.0,
      "completions/mean_length": 263.5,
      "completions/mean_terminated_length": 263.5,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.4623207077383995,
      "epoch": 0.02283464566929134,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06237829104065895,
      "kl": 0.001233977556694299,
      "learning_rate": 9.954423344140806e-07,
      "loss": 0.0939,
      "num_tokens": 13751518.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 493,
      "step_time": 27.27083507925272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 193.6875,
      "completions/mean_terminated_length": 193.6875,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.2014639787375927,
      "epoch": 0.02288096340898564,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05373659357428551,
      "kl": 0.000706080420059152,
      "learning_rate": 9.954330708661417e-07,
      "loss": 0.0154,
      "num_tokens": 13774441.0,
      "reward": 0.9466155767440796,
      "reward_std": 0.02992909401655197,
      "rewards/reward_func/mean": 0.9466155767440796,
      "rewards/reward_func/std": 0.029929067939519882,
      "step": 494,
      "step_time": 19.53807992488146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 166.0625,
      "completions/mean_terminated_length": 166.0625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.407911978662014,
      "epoch": 0.022927281148679944,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012372437631711364,
      "kl": 0.0017013985197991133,
      "learning_rate": 9.954238073182029e-07,
      "loss": 0.0001,
      "num_tokens": 13804762.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 495,
      "step_time": 19.689786564558744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 132.3125,
      "completions/mean_terminated_length": 132.3125,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.3699936494231224,
      "epoch": 0.022973598888374247,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007731465739198029,
      "kl": 0.0011656057758955285,
      "learning_rate": 9.95414543770264e-07,
      "loss": 0.0001,
      "num_tokens": 13827263.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 496,
      "step_time": 16.126573752611876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 194.6875,
      "completions/mean_terminated_length": 194.6875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3789226859807968,
      "epoch": 0.02301991662806855,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07127277553081512,
      "kl": 0.008387453213799745,
      "learning_rate": 9.95405280222325e-07,
      "loss": -0.0577,
      "num_tokens": 13856154.0,
      "reward": 0.03359834849834442,
      "reward_std": 0.1343933790922165,
      "rewards/reward_func/mean": 0.03359834849834442,
      "rewards/reward_func/std": 0.13439339399337769,
      "step": 497,
      "step_time": 22.471235185861588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 125.4375,
      "completions/mean_terminated_length": 125.4375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.3219013437628746,
      "epoch": 0.023066234367762853,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009430369827896357,
      "kl": 0.0014280521718319505,
      "learning_rate": 9.953960166743862e-07,
      "loss": 0.0001,
      "num_tokens": 13879857.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 498,
      "step_time": 14.805292218923569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 124.75,
      "completions/mean_terminated_length": 124.75,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.3636002764105797,
      "epoch": 0.023112552107457156,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008227949729189277,
      "kl": 0.0012659696803893894,
      "learning_rate": 9.953867531264474e-07,
      "loss": 0.0001,
      "num_tokens": 13916253.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 499,
      "step_time": 18.45577147603035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 316.0,
      "completions/max_terminated_length": 316.0,
      "completions/mean_length": 232.3125,
      "completions/mean_terminated_length": 232.3125,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.38027258962392807,
      "epoch": 0.02315886984715146,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0514298640191555,
      "kl": 0.0013120945077389479,
      "learning_rate": 9.953774895785085e-07,
      "loss": -0.1189,
      "num_tokens": 13950914.0,
      "reward": 0.1100812703371048,
      "reward_std": 0.30393990874290466,
      "rewards/reward_func/mean": 0.1100812703371048,
      "rewards/reward_func/std": 0.30393993854522705,
      "step": 500,
      "step_time": 28.57885792478919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 192.5625,
      "completions/mean_terminated_length": 192.5625,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.20426442846655846,
      "epoch": 0.02320518758684576,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004283256712369621,
      "kl": 0.0006900559674249962,
      "learning_rate": 9.953682260305696e-07,
      "loss": 0.0,
      "num_tokens": 13978923.0,
      "reward": 0.9555630087852478,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9555630087852478,
      "rewards/reward_func/std": 0.0,
      "step": 501,
      "step_time": 20.524554256349802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 143.375,
      "completions/mean_terminated_length": 143.375,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.1600031666457653,
      "epoch": 0.023251505326540065,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004713285597972572,
      "kl": 0.0007226398593047634,
      "learning_rate": 9.95358962482631e-07,
      "loss": 0.0,
      "num_tokens": 13999361.0,
      "reward": 0.5697828531265259,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5697828531265259,
      "rewards/reward_func/std": 0.0,
      "step": 502,
      "step_time": 14.741988241672516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 242.875,
      "completions/mean_terminated_length": 242.875,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "entropy": 0.2466079480946064,
      "epoch": 0.023297823066234367,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005793596501462162,
      "kl": 0.0008950445044320077,
      "learning_rate": 9.953496989346918e-07,
      "loss": 0.0,
      "num_tokens": 14030511.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 503,
      "step_time": 24.83066874742508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 128.5625,
      "completions/mean_terminated_length": 128.5625,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.3423362225294113,
      "epoch": 0.02334414080592867,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007454920560121536,
      "kl": 0.001252439513336867,
      "learning_rate": 9.95340435386753e-07,
      "loss": 0.0001,
      "num_tokens": 14050760.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 504,
      "step_time": 15.452152878046036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 184.625,
      "completions/mean_terminated_length": 184.625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.37029866874217987,
      "epoch": 0.023390458545622973,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006513205007649958,
      "kl": 0.0011104259465355426,
      "learning_rate": 9.953311718388143e-07,
      "loss": 0.0001,
      "num_tokens": 14072466.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 505,
      "step_time": 17.8039546944201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 140.6875,
      "completions/mean_terminated_length": 140.6875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.19547295570373535,
      "epoch": 0.023436776285317276,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007712005754001439,
      "kl": 0.0010060306667583063,
      "learning_rate": 9.953219082908754e-07,
      "loss": 0.0001,
      "num_tokens": 14097037.0,
      "reward": 0.4203503727912903,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.4203503727912903,
      "rewards/reward_func/std": 0.0,
      "step": 506,
      "step_time": 15.219443429261446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 146.75,
      "completions/mean_terminated_length": 146.75,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3754529803991318,
      "epoch": 0.02348309402501158,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011129234917461872,
      "kl": 0.0015130334359128028,
      "learning_rate": 9.953126447429366e-07,
      "loss": 0.0001,
      "num_tokens": 14141049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 507,
      "step_time": 22.853277012705803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 181.9375,
      "completions/mean_terminated_length": 181.9375,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.4032375067472458,
      "epoch": 0.023529411764705882,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001507847453467548,
      "kl": 0.0017335480370093137,
      "learning_rate": 9.953033811949977e-07,
      "loss": 0.0001,
      "num_tokens": 14173304.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 508,
      "step_time": 22.368615679442883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 387.0,
      "completions/max_terminated_length": 387.0,
      "completions/mean_length": 302.6875,
      "completions/mean_terminated_length": 302.6875,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "entropy": 0.25965646654367447,
      "epoch": 0.023575729504400185,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05408618599176407,
      "kl": 0.0009873932285699993,
      "learning_rate": 9.952941176470588e-07,
      "loss": 0.0077,
      "num_tokens": 14204499.0,
      "reward": 0.923896312713623,
      "reward_std": 0.015388688072562218,
      "rewards/reward_func/mean": 0.923896312713623,
      "rewards/reward_func/std": 0.015388698317110538,
      "step": 509,
      "step_time": 32.79489414393902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 130.0,
      "completions/max_terminated_length": 130.0,
      "completions/mean_length": 107.8125,
      "completions/mean_terminated_length": 107.8125,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "entropy": 0.27770448476076126,
      "epoch": 0.023622047244094488,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007120308000594378,
      "kl": 0.001090057980036363,
      "learning_rate": 9.9528485409912e-07,
      "loss": 0.0001,
      "num_tokens": 14224224.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 510,
      "step_time": 12.823708292096853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 133.9375,
      "completions/mean_terminated_length": 133.9375,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3049366846680641,
      "epoch": 0.02366836498378879,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012418744154274464,
      "kl": 0.0012951338139828295,
      "learning_rate": 9.95275590551181e-07,
      "loss": 0.0001,
      "num_tokens": 14260191.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 511,
      "step_time": 17.79165342450142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 122.0,
      "completions/mean_terminated_length": 122.0,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.29965297132730484,
      "epoch": 0.023714682723483094,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014361172216013074,
      "kl": 0.0014124661247478798,
      "learning_rate": 9.952663270032422e-07,
      "loss": 0.0001,
      "num_tokens": 14280591.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 512,
      "step_time": 14.293915253132582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 157.0,
      "completions/mean_terminated_length": 157.0,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.16042153909802437,
      "epoch": 0.023761000463177397,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005578518030233681,
      "kl": 0.0007172806363087147,
      "learning_rate": 9.952570634553033e-07,
      "loss": 0.0,
      "num_tokens": 14301999.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 513,
      "step_time": 16.87550877034664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 139.5,
      "completions/mean_terminated_length": 139.5,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.27408745139837265,
      "epoch": 0.0238073182028717,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013380669988691807,
      "kl": 0.001637225184822455,
      "learning_rate": 9.952477999073644e-07,
      "loss": 0.0001,
      "num_tokens": 14323767.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 514,
      "step_time": 15.414053175598383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 201.125,
      "completions/mean_terminated_length": 201.125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.37802645564079285,
      "epoch": 0.023853635942566002,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06617993861436844,
      "kl": 0.000993274530628696,
      "learning_rate": 9.952385363594256e-07,
      "loss": -0.0897,
      "num_tokens": 14345673.0,
      "reward": 0.2039203941822052,
      "reward_std": 0.3123822808265686,
      "rewards/reward_func/mean": 0.2039203941822052,
      "rewards/reward_func/std": 0.3123822510242462,
      "step": 515,
      "step_time": 23.67078560963273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 156.875,
      "completions/mean_terminated_length": 156.875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.19579781964421272,
      "epoch": 0.023899953682260305,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0758872702717781,
      "kl": 0.0009364501020172611,
      "learning_rate": 9.952292728114867e-07,
      "loss": -0.0277,
      "num_tokens": 14369527.0,
      "reward": 0.8958791494369507,
      "reward_std": 0.04135281220078468,
      "rewards/reward_func/mean": 0.8958791494369507,
      "rewards/reward_func/std": 0.041352808475494385,
      "step": 516,
      "step_time": 16.928487829864025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 132.0,
      "completions/mean_terminated_length": 132.0,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.29442620277404785,
      "epoch": 0.02394627142195461,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006312267505563796,
      "kl": 0.0012187010725028813,
      "learning_rate": 9.952200092635478e-07,
      "loss": 0.0001,
      "num_tokens": 14391319.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 517,
      "step_time": 15.052225556224585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 138.8125,
      "completions/mean_terminated_length": 138.8125,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.24572154507040977,
      "epoch": 0.02399258916164891,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008521306444890797,
      "kl": 0.0010103998938575387,
      "learning_rate": 9.95210745715609e-07,
      "loss": 0.0001,
      "num_tokens": 14412100.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 518,
      "step_time": 14.790097005665302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 163.6875,
      "completions/mean_terminated_length": 163.6875,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.3711855635046959,
      "epoch": 0.024038906901343214,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004503102391026914,
      "kl": 0.001057341054547578,
      "learning_rate": 9.952014821676703e-07,
      "loss": 0.0001,
      "num_tokens": 14436751.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 519,
      "step_time": 17.75486048310995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 144.9375,
      "completions/mean_terminated_length": 144.9375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.2414422668516636,
      "epoch": 0.024085224641037517,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006374597433023155,
      "kl": 0.0008082981657935306,
      "learning_rate": 9.951922186197314e-07,
      "loss": 0.0,
      "num_tokens": 14456766.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 520,
      "step_time": 15.141897857189178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 305.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 193.5625,
      "completions/mean_terminated_length": 193.5625,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.41844432801008224,
      "epoch": 0.02413154238073182,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013825239147990942,
      "kl": 0.0015546581416856498,
      "learning_rate": 9.951829550717923e-07,
      "loss": 0.0001,
      "num_tokens": 14478535.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 521,
      "step_time": 25.464532054960728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 115.25,
      "completions/mean_terminated_length": 115.25,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.26581552624702454,
      "epoch": 0.024177860120426123,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017415828770026565,
      "kl": 0.0015278060163836926,
      "learning_rate": 9.951736915238536e-07,
      "loss": 0.0001,
      "num_tokens": 14497899.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 522,
      "step_time": 13.22960414364934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 176.5625,
      "completions/mean_terminated_length": 176.5625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.34090740233659744,
      "epoch": 0.024224177860120426,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005656389403156936,
      "kl": 0.0011169943463755772,
      "learning_rate": 9.951644279759148e-07,
      "loss": 0.0001,
      "num_tokens": 14524932.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 523,
      "step_time": 21.30846729502082
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 146.125,
      "completions/mean_terminated_length": 146.125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.18956785276532173,
      "epoch": 0.02427049559981473,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000552937388420105,
      "kl": 0.0009237581252818927,
      "learning_rate": 9.95155164427976e-07,
      "loss": 0.0,
      "num_tokens": 14545590.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 524,
      "step_time": 15.692125141620636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 337.0,
      "completions/max_terminated_length": 337.0,
      "completions/mean_length": 280.8125,
      "completions/mean_terminated_length": 280.8125,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "entropy": 0.1719597429037094,
      "epoch": 0.02431681333950903,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04176125302910805,
      "kl": 0.0007282002043211833,
      "learning_rate": 9.95145900880037e-07,
      "loss": -0.1117,
      "num_tokens": 14578867.0,
      "reward": 0.6490556001663208,
      "reward_std": 0.3030133843421936,
      "rewards/reward_func/mean": 0.6490556001663208,
      "rewards/reward_func/std": 0.3030133843421936,
      "step": 525,
      "step_time": 30.006157591938972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 195.9375,
      "completions/mean_terminated_length": 195.9375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.2237008437514305,
      "epoch": 0.024363131079203335,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009089101804420352,
      "kl": 0.0010247548634652048,
      "learning_rate": 9.951366373320981e-07,
      "loss": 0.0001,
      "num_tokens": 14609650.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 526,
      "step_time": 21.09304867312312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 138.375,
      "completions/mean_terminated_length": 138.375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.1688799485564232,
      "epoch": 0.024409448818897637,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007246082532219589,
      "kl": 0.0008973616058938205,
      "learning_rate": 9.951273737841593e-07,
      "loss": 0.0,
      "num_tokens": 14642344.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 527,
      "step_time": 17.77213069051504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 205.4375,
      "completions/mean_terminated_length": 205.4375,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.41943180561065674,
      "epoch": 0.02445576655859194,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006336580845527351,
      "kl": 0.0012316417414695024,
      "learning_rate": 9.951181102362204e-07,
      "loss": 0.0001,
      "num_tokens": 14667151.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 528,
      "step_time": 22.779205039143562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 132.125,
      "completions/mean_terminated_length": 132.125,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.2875938192009926,
      "epoch": 0.024502084298286243,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009506710339337587,
      "kl": 0.0011937796953134239,
      "learning_rate": 9.951088466882815e-07,
      "loss": 0.0001,
      "num_tokens": 14689745.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 529,
      "step_time": 15.668328743427992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 129.0,
      "completions/max_terminated_length": 129.0,
      "completions/mean_length": 112.6875,
      "completions/mean_terminated_length": 112.6875,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.2612050920724869,
      "epoch": 0.024548402037980546,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003256069030612707,
      "kl": 0.0018359064706601202,
      "learning_rate": 9.950995831403426e-07,
      "loss": 0.0001,
      "num_tokens": 14709964.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 530,
      "step_time": 12.744639925658703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 171.5625,
      "completions/mean_terminated_length": 171.5625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3694435805082321,
      "epoch": 0.02459471977767485,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018149253446608782,
      "kl": 0.0023198953131213784,
      "learning_rate": 9.950903195924038e-07,
      "loss": 0.0001,
      "num_tokens": 14738821.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 531,
      "step_time": 20.87821977958083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 147.0625,
      "completions/mean_terminated_length": 147.0625,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.2528420351445675,
      "epoch": 0.024641037517369152,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07727481424808502,
      "kl": 0.0011586096952669322,
      "learning_rate": 9.950810560444651e-07,
      "loss": 0.0991,
      "num_tokens": 14761974.0,
      "reward": 0.9899153709411621,
      "reward_std": 0.027556534856557846,
      "rewards/reward_func/mean": 0.9899153709411621,
      "rewards/reward_func/std": 0.027556534856557846,
      "step": 532,
      "step_time": 17.73023172095418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 272.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 214.875,
      "completions/mean_terminated_length": 214.875,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.36899401247501373,
      "epoch": 0.024687355257063455,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08863791823387146,
      "kl": 0.0012757852673530579,
      "learning_rate": 9.950717924965262e-07,
      "loss": -0.0945,
      "num_tokens": 14787492.0,
      "reward": 0.4648560881614685,
      "reward_std": 0.4801013171672821,
      "rewards/reward_func/mean": 0.4648560881614685,
      "rewards/reward_func/std": 0.4801013171672821,
      "step": 533,
      "step_time": 24.023204747587442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 272.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 222.75,
      "completions/mean_terminated_length": 222.75,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.2912716940045357,
      "epoch": 0.024733672996757758,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06058639660477638,
      "kl": 0.0009023299644468352,
      "learning_rate": 9.950625289485871e-07,
      "loss": -0.0476,
      "num_tokens": 14820624.0,
      "reward": 0.8208504915237427,
      "reward_std": 0.24021044373512268,
      "rewards/reward_func/mean": 0.8208504915237427,
      "rewards/reward_func/std": 0.24021045863628387,
      "step": 534,
      "step_time": 25.64229280874133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 218.25,
      "completions/mean_terminated_length": 218.25,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.4075919911265373,
      "epoch": 0.02477999073645206,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06277602910995483,
      "kl": 0.001133722806116566,
      "learning_rate": 9.950532654006485e-07,
      "loss": -0.0851,
      "num_tokens": 14848068.0,
      "reward": 0.3356561064720154,
      "reward_std": 0.4581764340400696,
      "rewards/reward_func/mean": 0.3356561064720154,
      "rewards/reward_func/std": 0.45817646384239197,
      "step": 535,
      "step_time": 23.9300565905869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 126.375,
      "completions/mean_terminated_length": 126.375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.24887803941965103,
      "epoch": 0.024826308476146364,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006139138713479042,
      "kl": 0.0011030811874661595,
      "learning_rate": 9.950440018527096e-07,
      "loss": 0.0001,
      "num_tokens": 14867770.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 536,
      "step_time": 13.15186096355319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 133.25,
      "completions/mean_terminated_length": 133.25,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.24105515331029892,
      "epoch": 0.024872626215840667,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001104863709770143,
      "kl": 0.0011861571983899921,
      "learning_rate": 9.950347383047707e-07,
      "loss": 0.0001,
      "num_tokens": 14887486.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 537,
      "step_time": 13.874327003955841
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 149.625,
      "completions/mean_terminated_length": 149.625,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.17102652788162231,
      "epoch": 0.02491894395553497,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06881299614906311,
      "kl": 0.0006279955705394968,
      "learning_rate": 9.950254747568319e-07,
      "loss": 0.0741,
      "num_tokens": 14908040.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 538,
      "step_time": 16.973441254347563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 142.625,
      "completions/mean_terminated_length": 142.625,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.2013726904988289,
      "epoch": 0.024965261695229272,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014457677025347948,
      "kl": 0.001240854966454208,
      "learning_rate": 9.95016211208893e-07,
      "loss": 0.0001,
      "num_tokens": 14929746.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 539,
      "step_time": 15.499807421118021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 137.25,
      "completions/mean_terminated_length": 137.25,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.22709523141384125,
      "epoch": 0.025011579434923575,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004049050621688366,
      "kl": 0.0022631227038800716,
      "learning_rate": 9.950069476609541e-07,
      "loss": 0.0001,
      "num_tokens": 14949382.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 540,
      "step_time": 14.647448178380728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 149.25,
      "completions/mean_terminated_length": 149.25,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.41529327630996704,
      "epoch": 0.02505789717461788,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012068174546584487,
      "kl": 0.0019127448613289744,
      "learning_rate": 9.949976841130152e-07,
      "loss": 0.0001,
      "num_tokens": 14996570.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 541,
      "step_time": 21.505676943808794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 125.0,
      "completions/mean_terminated_length": 125.0,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.27272979170084,
      "epoch": 0.02510421491431218,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013653446221724153,
      "kl": 0.0012410607014317065,
      "learning_rate": 9.949884205650764e-07,
      "loss": 0.0001,
      "num_tokens": 15018394.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 542,
      "step_time": 14.302406802773476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 179.5625,
      "completions/mean_terminated_length": 179.5625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3766372799873352,
      "epoch": 0.025150532654006484,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007904847734607756,
      "kl": 0.0015201152418740094,
      "learning_rate": 9.949791570171375e-07,
      "loss": 0.0001,
      "num_tokens": 15041555.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 543,
      "step_time": 19.756406288594007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 134.375,
      "completions/mean_terminated_length": 134.375,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3292296230792999,
      "epoch": 0.025196850393700787,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007243999862112105,
      "kl": 0.0012820618867408484,
      "learning_rate": 9.949698934691986e-07,
      "loss": 0.0001,
      "num_tokens": 15066489.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 544,
      "step_time": 15.174732618033886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 131.4375,
      "completions/mean_terminated_length": 131.4375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3490144908428192,
      "epoch": 0.02524316813339509,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008763514342717826,
      "kl": 0.0013646596344187856,
      "learning_rate": 9.9496062992126e-07,
      "loss": 0.0001,
      "num_tokens": 15088240.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 545,
      "step_time": 15.217640075832605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 175.1875,
      "completions/mean_terminated_length": 175.1875,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.12979639321565628,
      "epoch": 0.025289485873089393,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023179855197668076,
      "kl": 0.002074635813187342,
      "learning_rate": 9.949513663733209e-07,
      "loss": 0.0001,
      "num_tokens": 15112067.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 546,
      "step_time": 17.45606468990445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 297.0,
      "completions/max_terminated_length": 297.0,
      "completions/mean_length": 209.0,
      "completions/mean_terminated_length": 209.0,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.4022025465965271,
      "epoch": 0.025335803612783696,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06028041988611221,
      "kl": 0.0012326524301897734,
      "learning_rate": 9.94942102825382e-07,
      "loss": 0.0407,
      "num_tokens": 15138451.0,
      "reward": 0.625,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5,
      "step": 547,
      "step_time": 26.755181174725294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 306.0,
      "completions/max_terminated_length": 306.0,
      "completions/mean_length": 259.125,
      "completions/mean_terminated_length": 259.125,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "entropy": 0.2725084200501442,
      "epoch": 0.025382121352478,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0626678466796875,
      "kl": 0.0014349300763569772,
      "learning_rate": 9.94932839277443e-07,
      "loss": -0.0022,
      "num_tokens": 15175061.0,
      "reward": 0.9788992404937744,
      "reward_std": 0.005628441926091909,
      "rewards/reward_func/mean": 0.9788992404937744,
      "rewards/reward_func/std": 0.00562844006344676,
      "step": 548,
      "step_time": 28.6032482534647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 164.5625,
      "completions/mean_terminated_length": 164.5625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.39379655569791794,
      "epoch": 0.0254284390921723,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008524972945451736,
      "kl": 0.001492806535679847,
      "learning_rate": 9.949235757295044e-07,
      "loss": 0.0001,
      "num_tokens": 15228350.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 549,
      "step_time": 25.498115357011557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 186.25,
      "completions/mean_terminated_length": 186.25,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.4791732355952263,
      "epoch": 0.025474756831866605,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06944435089826584,
      "kl": 0.0016198160010389984,
      "learning_rate": 9.949143121815656e-07,
      "loss": 0.091,
      "num_tokens": 15251138.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 550,
      "step_time": 22.150676514953375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 345.0,
      "completions/max_terminated_length": 345.0,
      "completions/mean_length": 262.6875,
      "completions/mean_terminated_length": 262.6875,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "entropy": 0.2680952474474907,
      "epoch": 0.025521074571560907,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04636561870574951,
      "kl": 0.0008887053554644808,
      "learning_rate": 9.949050486336267e-07,
      "loss": -0.0425,
      "num_tokens": 15274813.0,
      "reward": 0.8137565851211548,
      "reward_std": 0.2170029729604721,
      "rewards/reward_func/mean": 0.8137565851211548,
      "rewards/reward_func/std": 0.2170029729604721,
      "step": 551,
      "step_time": 29.685348197817802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 128.0625,
      "completions/mean_terminated_length": 128.0625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2995965778827667,
      "epoch": 0.02556739231125521,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009323018603026867,
      "kl": 0.001274977985303849,
      "learning_rate": 9.948957850856878e-07,
      "loss": 0.0001,
      "num_tokens": 15301294.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 552,
      "step_time": 15.274673901498318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 177.1875,
      "completions/mean_terminated_length": 177.1875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.1796923540532589,
      "epoch": 0.025613710050949513,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0792279988527298,
      "kl": 0.0010075290047097951,
      "learning_rate": 9.94886521537749e-07,
      "loss": -0.0282,
      "num_tokens": 15339537.0,
      "reward": 0.8645535707473755,
      "reward_std": 0.19388173520565033,
      "rewards/reward_func/mean": 0.8645535707473755,
      "rewards/reward_func/std": 0.19388172030448914,
      "step": 553,
      "step_time": 22.450017869472504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 177.25,
      "completions/mean_terminated_length": 177.25,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.3709481582045555,
      "epoch": 0.025660027790643816,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012022164883092046,
      "kl": 0.0018011342617683113,
      "learning_rate": 9.9487725798981e-07,
      "loss": 0.0001,
      "num_tokens": 15368469.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 554,
      "step_time": 19.368321228772402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 156.25,
      "completions/mean_terminated_length": 156.25,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.3679218143224716,
      "epoch": 0.02570634553033812,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007441905909217894,
      "kl": 0.0014579891285393387,
      "learning_rate": 9.948679944418712e-07,
      "loss": 0.0001,
      "num_tokens": 15390825.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 555,
      "step_time": 16.342542689293623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 138.625,
      "completions/mean_terminated_length": 138.625,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.29407284408807755,
      "epoch": 0.025752663270032422,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011675918940454721,
      "kl": 0.0012941797176608816,
      "learning_rate": 9.948587308939323e-07,
      "loss": 0.0001,
      "num_tokens": 15426867.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 556,
      "step_time": 19.005365189164877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 193.625,
      "completions/mean_terminated_length": 193.625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.357125461101532,
      "epoch": 0.025798981009726725,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027116169221699238,
      "kl": 0.0019961106590926647,
      "learning_rate": 9.948494673459934e-07,
      "loss": 0.0001,
      "num_tokens": 15451245.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 557,
      "step_time": 20.6795228458941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 124.0,
      "completions/max_terminated_length": 124.0,
      "completions/mean_length": 104.375,
      "completions/mean_terminated_length": 104.375,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.23302504047751427,
      "epoch": 0.025845298749421028,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006331351469270885,
      "kl": 0.000980119570158422,
      "learning_rate": 9.948402037980546e-07,
      "loss": 0.0,
      "num_tokens": 15472035.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 558,
      "step_time": 12.356189779937267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 283.0,
      "completions/max_terminated_length": 283.0,
      "completions/mean_length": 178.8125,
      "completions/mean_terminated_length": 178.8125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.2559828981757164,
      "epoch": 0.02589161648911533,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005925624864175916,
      "kl": 0.0008678560116095468,
      "learning_rate": 9.948309402501157e-07,
      "loss": 0.0,
      "num_tokens": 15497216.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 559,
      "step_time": 24.370823446661234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 157.3125,
      "completions/mean_terminated_length": 157.3125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.16670601069927216,
      "epoch": 0.025937934228809634,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007910909480415285,
      "kl": 0.000819086650153622,
      "learning_rate": 9.948216767021768e-07,
      "loss": 0.0,
      "num_tokens": 15534069.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 560,
      "step_time": 21.16285178437829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 119.1875,
      "completions/mean_terminated_length": 119.1875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.35410934686660767,
      "epoch": 0.025984251968503937,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035102088004350662,
      "kl": 0.0024094951804727316,
      "learning_rate": 9.94812413154238e-07,
      "loss": 0.0001,
      "num_tokens": 15561608.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 561,
      "step_time": 15.391472723335028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 118.4375,
      "completions/mean_terminated_length": 118.4375,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.35063062608242035,
      "epoch": 0.02603056970819824,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019810523372143507,
      "kl": 0.0023975164513103664,
      "learning_rate": 9.948031496062993e-07,
      "loss": 0.0001,
      "num_tokens": 15597423.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 562,
      "step_time": 17.099945228546858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 171.9375,
      "completions/mean_terminated_length": 171.9375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.21170199662446976,
      "epoch": 0.026076887447892542,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08928564190864563,
      "kl": 0.001233914343174547,
      "learning_rate": 9.947938860583604e-07,
      "loss": -0.0256,
      "num_tokens": 15634654.0,
      "reward": 0.660243809223175,
      "reward_std": 0.1326272338628769,
      "rewards/reward_func/mean": 0.660243809223175,
      "rewards/reward_func/std": 0.1326272338628769,
      "step": 563,
      "step_time": 21.962135393172503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 135.5625,
      "completions/mean_terminated_length": 135.5625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.2716743163764477,
      "epoch": 0.026123205187586845,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008933998178690672,
      "kl": 0.0010183459962718189,
      "learning_rate": 9.947846225104215e-07,
      "loss": 0.0001,
      "num_tokens": 15657159.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 564,
      "step_time": 16.57527555525303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 146.25,
      "completions/mean_terminated_length": 146.25,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.39399532228708267,
      "epoch": 0.02616952292728115,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012033042730763555,
      "kl": 0.0012665592366829515,
      "learning_rate": 9.947753589624827e-07,
      "loss": 0.0001,
      "num_tokens": 15682171.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 565,
      "step_time": 16.046187974512577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 177.1875,
      "completions/mean_terminated_length": 177.1875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3469313308596611,
      "epoch": 0.02621584066697545,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013318386627361178,
      "kl": 0.001602203119546175,
      "learning_rate": 9.947660954145438e-07,
      "loss": 0.0001,
      "num_tokens": 15714926.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 566,
      "step_time": 22.901175644248724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 388.0,
      "completions/max_terminated_length": 388.0,
      "completions/mean_length": 237.6875,
      "completions/mean_terminated_length": 237.6875,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.4689154475927353,
      "epoch": 0.026262158406669754,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07163114845752716,
      "kl": 0.0014194734394550323,
      "learning_rate": 9.94756831866605e-07,
      "loss": -0.163,
      "num_tokens": 15754969.0,
      "reward": 0.07219797372817993,
      "reward_std": 0.24258717894554138,
      "rewards/reward_func/mean": 0.07219797372817993,
      "rewards/reward_func/std": 0.24258717894554138,
      "step": 567,
      "step_time": 35.50100315362215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 199.1875,
      "completions/mean_terminated_length": 199.1875,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.4303721934556961,
      "epoch": 0.026308476146364057,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08563347905874252,
      "kl": 0.0017922719707712531,
      "learning_rate": 9.94747568318666e-07,
      "loss": -0.0331,
      "num_tokens": 15786908.0,
      "reward": 0.22827517986297607,
      "reward_std": 0.4083510637283325,
      "rewards/reward_func/mean": 0.22827517986297607,
      "rewards/reward_func/std": 0.4083510637283325,
      "step": 568,
      "step_time": 22.553745798766613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 135.0625,
      "completions/mean_terminated_length": 135.0625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3000262975692749,
      "epoch": 0.02635479388605836,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014759069308638573,
      "kl": 0.0014760128979105502,
      "learning_rate": 9.947383047707272e-07,
      "loss": 0.0001,
      "num_tokens": 15811437.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 569,
      "step_time": 15.757252767682076
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 195.375,
      "completions/mean_terminated_length": 195.375,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.33487706631422043,
      "epoch": 0.026401111625752663,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000654570001643151,
      "kl": 0.0012129094975534827,
      "learning_rate": 9.947290412227883e-07,
      "loss": 0.0001,
      "num_tokens": 15843091.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 570,
      "step_time": 21.559228021651506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 156.5625,
      "completions/mean_terminated_length": 156.5625,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.16004950553178787,
      "epoch": 0.026447429365446966,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.053859077394008636,
      "kl": 0.0007608709565829486,
      "learning_rate": 9.947197776748494e-07,
      "loss": 0.0176,
      "num_tokens": 15871244.0,
      "reward": 0.9782751798629761,
      "reward_std": 0.03886254131793976,
      "rewards/reward_func/mean": 0.9782751798629761,
      "rewards/reward_func/std": 0.03886253759264946,
      "step": 571,
      "step_time": 17.66007414087653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 178.5,
      "completions/mean_terminated_length": 178.5,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.36595743894577026,
      "epoch": 0.02649374710514127,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010515021858736873,
      "kl": 0.0011732606799341738,
      "learning_rate": 9.947105141269105e-07,
      "loss": 0.0001,
      "num_tokens": 15892852.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 572,
      "step_time": 20.06080413982272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 179.9375,
      "completions/mean_terminated_length": 179.9375,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.17148252204060555,
      "epoch": 0.02654006484483557,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00039555327384732664,
      "kl": 0.0006037524726707488,
      "learning_rate": 9.947012505789717e-07,
      "loss": 0.0,
      "num_tokens": 15926035.0,
      "reward": 0.9214109182357788,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9214109182357788,
      "rewards/reward_func/std": 0.0,
      "step": 573,
      "step_time": 21.273090057075024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 134.25,
      "completions/mean_terminated_length": 134.25,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.29171208292245865,
      "epoch": 0.026586382584529875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013545382535085082,
      "kl": 0.0012165867956355214,
      "learning_rate": 9.946919870310328e-07,
      "loss": 0.0001,
      "num_tokens": 15954263.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 574,
      "step_time": 17.165605064481497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 113.75,
      "completions/mean_terminated_length": 113.75,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.24007542058825493,
      "epoch": 0.026632700324224177,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016633195336908102,
      "kl": 0.0016958365449681878,
      "learning_rate": 9.946827234830941e-07,
      "loss": 0.0001,
      "num_tokens": 15973587.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 575,
      "step_time": 13.618320003151894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 205.625,
      "completions/mean_terminated_length": 205.625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.2145506590604782,
      "epoch": 0.02667901806391848,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006946335779502988,
      "kl": 0.000943855571676977,
      "learning_rate": 9.946734599351552e-07,
      "loss": 0.0,
      "num_tokens": 16007629.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 576,
      "step_time": 23.893839932978153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 225.9375,
      "completions/mean_terminated_length": 225.9375,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.19930892810225487,
      "epoch": 0.026725335803612783,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003200446080882102,
      "kl": 0.0006423304002964869,
      "learning_rate": 9.946641963872162e-07,
      "loss": 0.0,
      "num_tokens": 16038444.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 577,
      "step_time": 24.372179005295038
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 158.625,
      "completions/mean_terminated_length": 158.625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.3417799845337868,
      "epoch": 0.026771653543307086,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006592239369638264,
      "kl": 0.0012571831757668406,
      "learning_rate": 9.946549328392773e-07,
      "loss": 0.0001,
      "num_tokens": 16062406.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 578,
      "step_time": 17.421500850468874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 111.0,
      "completions/mean_terminated_length": 111.0,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.29721803963184357,
      "epoch": 0.02681797128300139,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014690555399283767,
      "kl": 0.0014924766728654504,
      "learning_rate": 9.946456692913386e-07,
      "loss": 0.0001,
      "num_tokens": 16082150.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 579,
      "step_time": 12.813174404203892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 126.3125,
      "completions/mean_terminated_length": 126.3125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2668394297361374,
      "epoch": 0.026864289022695692,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011392865562811494,
      "kl": 0.0012131782714277506,
      "learning_rate": 9.946364057433997e-07,
      "loss": 0.0001,
      "num_tokens": 16104267.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 580,
      "step_time": 13.761691994965076
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 126.6875,
      "completions/mean_terminated_length": 126.6875,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.3839539736509323,
      "epoch": 0.026910606762389995,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009683132520876825,
      "kl": 0.0014040677051525563,
      "learning_rate": 9.946271421954609e-07,
      "loss": 0.0001,
      "num_tokens": 16128806.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 581,
      "step_time": 14.585123918950558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 158.625,
      "completions/mean_terminated_length": 158.625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3227327987551689,
      "epoch": 0.026956924502084298,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008882497786544263,
      "kl": 0.0011051030887756497,
      "learning_rate": 9.94617878647522e-07,
      "loss": 0.0001,
      "num_tokens": 16151552.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 582,
      "step_time": 16.934905491769314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 298.0,
      "completions/max_terminated_length": 298.0,
      "completions/mean_length": 201.1875,
      "completions/mean_terminated_length": 201.1875,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.4244815856218338,
      "epoch": 0.0270032422417786,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009207507828250527,
      "kl": 0.0014205464394763112,
      "learning_rate": 9.946086150995831e-07,
      "loss": 0.0001,
      "num_tokens": 16183907.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 583,
      "step_time": 26.793815910816193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 168.4375,
      "completions/mean_terminated_length": 168.4375,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.14582490175962448,
      "epoch": 0.027049559981472904,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000684766098856926,
      "kl": 0.0006677401397610083,
      "learning_rate": 9.945993515516442e-07,
      "loss": 0.0,
      "num_tokens": 16219594.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 584,
      "step_time": 21.359646912664175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 197.75,
      "completions/mean_terminated_length": 197.75,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.20227548852562904,
      "epoch": 0.027095877721167207,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04735409840941429,
      "kl": 0.0008678182202856988,
      "learning_rate": 9.945900880037054e-07,
      "loss": 0.0129,
      "num_tokens": 16246934.0,
      "reward": 0.9480330944061279,
      "reward_std": 0.012785443104803562,
      "rewards/reward_func/mean": 0.9480330944061279,
      "rewards/reward_func/std": 0.012785449624061584,
      "step": 585,
      "step_time": 20.081316489726305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 127.375,
      "completions/mean_terminated_length": 127.375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2501451075077057,
      "epoch": 0.02714219546086151,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009449580684304237,
      "kl": 0.0010224088036920875,
      "learning_rate": 9.945808244557665e-07,
      "loss": 0.0001,
      "num_tokens": 16270748.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 586,
      "step_time": 15.71675755828619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 179.125,
      "completions/mean_terminated_length": 179.125,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.19648923724889755,
      "epoch": 0.027188513200555812,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13904128968715668,
      "kl": 0.0014718308666488156,
      "learning_rate": 9.945715609078276e-07,
      "loss": -0.0463,
      "num_tokens": 16292158.0,
      "reward": 0.9261720180511475,
      "reward_std": 0.1320675164461136,
      "rewards/reward_func/mean": 0.9261720180511475,
      "rewards/reward_func/std": 0.1320675015449524,
      "step": 587,
      "step_time": 18.391562066972256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 149.625,
      "completions/mean_terminated_length": 149.625,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.1223986130207777,
      "epoch": 0.027234830940250115,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00037939532194286585,
      "kl": 0.0005342196382116526,
      "learning_rate": 9.945622973598887e-07,
      "loss": 0.0,
      "num_tokens": 16314776.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 588,
      "step_time": 15.983282055705786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 174.9375,
      "completions/mean_terminated_length": 174.9375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.41517775505781174,
      "epoch": 0.02728114867994442,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009759520762600005,
      "kl": 0.001477598212659359,
      "learning_rate": 9.945530338119499e-07,
      "loss": 0.0001,
      "num_tokens": 16346375.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 589,
      "step_time": 21.28076909482479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 149.0625,
      "completions/mean_terminated_length": 149.0625,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.34951143711805344,
      "epoch": 0.02732746641963872,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010513507295399904,
      "kl": 0.0013794851256534457,
      "learning_rate": 9.94543770264011e-07,
      "loss": 0.0001,
      "num_tokens": 16367064.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 590,
      "step_time": 17.958604458719492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 129.9375,
      "completions/mean_terminated_length": 129.9375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.29924070835113525,
      "epoch": 0.027373784159333024,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008221545722335577,
      "kl": 0.0011459146771812811,
      "learning_rate": 9.945345067160721e-07,
      "loss": 0.0001,
      "num_tokens": 16394343.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 591,
      "step_time": 15.76231737434864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 209.125,
      "completions/mean_terminated_length": 209.125,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.21035076677799225,
      "epoch": 0.027420101899027327,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07679693400859833,
      "kl": 0.0009747492003953084,
      "learning_rate": 9.945252431681334e-07,
      "loss": -0.0168,
      "num_tokens": 16430713.0,
      "reward": 0.907114565372467,
      "reward_std": 0.1997753530740738,
      "rewards/reward_func/mean": 0.907114565372467,
      "rewards/reward_func/std": 0.1997753530740738,
      "step": 592,
      "step_time": 24.519479889422655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 156.625,
      "completions/mean_terminated_length": 156.625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.3544777110219002,
      "epoch": 0.02746641963872163,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010482023935765028,
      "kl": 0.0015595678996760398,
      "learning_rate": 9.945159796201946e-07,
      "loss": 0.0001,
      "num_tokens": 16485491.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 593,
      "step_time": 24.91678934916854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 131.8125,
      "completions/mean_terminated_length": 131.8125,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.31441253423690796,
      "epoch": 0.027512737378415933,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001301216776482761,
      "kl": 0.00176118579111062,
      "learning_rate": 9.945067160722557e-07,
      "loss": 0.0001,
      "num_tokens": 16513808.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 594,
      "step_time": 17.711910124868155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 142.0625,
      "completions/mean_terminated_length": 142.0625,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.3171621486544609,
      "epoch": 0.027559055118110236,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007246560999192297,
      "kl": 0.0011956649977946654,
      "learning_rate": 9.944974525243166e-07,
      "loss": 0.0001,
      "num_tokens": 16534913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 595,
      "step_time": 16.61210546270013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 127.75,
      "completions/mean_terminated_length": 127.75,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.242061547935009,
      "epoch": 0.02760537285780454,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002108033047989011,
      "kl": 0.0013999624352436513,
      "learning_rate": 9.94488188976378e-07,
      "loss": 0.0001,
      "num_tokens": 16554509.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 596,
      "step_time": 14.333650436252356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 115.4375,
      "completions/mean_terminated_length": 115.4375,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "entropy": 0.31971098482608795,
      "epoch": 0.02765169059749884,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001012361142784357,
      "kl": 0.0013101025542709976,
      "learning_rate": 9.94478925428439e-07,
      "loss": 0.0001,
      "num_tokens": 16575684.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 597,
      "step_time": 13.299234211444855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 209.5,
      "completions/mean_terminated_length": 209.5,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.36262890696525574,
      "epoch": 0.027698008337193145,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0730704814195633,
      "kl": 0.0011368568520992994,
      "learning_rate": 9.944696618805002e-07,
      "loss": -0.0493,
      "num_tokens": 16598876.0,
      "reward": 0.2151769995689392,
      "reward_std": 0.38492029905319214,
      "rewards/reward_func/mean": 0.2151769995689392,
      "rewards/reward_func/std": 0.38492029905319214,
      "step": 598,
      "step_time": 24.221900921314955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 209.5,
      "completions/mean_terminated_length": 209.5,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "entropy": 0.19882144778966904,
      "epoch": 0.027744326076887448,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00044133211486041546,
      "kl": 0.0006942116451682523,
      "learning_rate": 9.944603983325613e-07,
      "loss": 0.0,
      "num_tokens": 16624212.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 599,
      "step_time": 21.113984003663063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 463.0,
      "completions/max_terminated_length": 463.0,
      "completions/mean_length": 282.1875,
      "completions/mean_terminated_length": 282.1875,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.30956175178289413,
      "epoch": 0.02779064381658175,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04635261371731758,
      "kl": 0.0010746768966782838,
      "learning_rate": 9.944511347846224e-07,
      "loss": -0.3788,
      "num_tokens": 16658311.0,
      "reward": 0.1990107297897339,
      "reward_std": 0.3048628866672516,
      "rewards/reward_func/mean": 0.1990107297897339,
      "rewards/reward_func/std": 0.3048628866672516,
      "step": 600,
      "step_time": 39.032493986189365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 160.3125,
      "completions/mean_terminated_length": 160.3125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.32160045206546783,
      "epoch": 0.027836961556276053,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006753444904461503,
      "kl": 0.0012247733538970351,
      "learning_rate": 9.944418712366836e-07,
      "loss": 0.0001,
      "num_tokens": 16679404.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 601,
      "step_time": 17.26866102963686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 221.125,
      "completions/mean_terminated_length": 221.125,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.3007632941007614,
      "epoch": 0.027883279295970356,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06767585128545761,
      "kl": 0.0011035270581487566,
      "learning_rate": 9.944326076887447e-07,
      "loss": 0.0107,
      "num_tokens": 16710062.0,
      "reward": 0.5670922994613647,
      "reward_std": 0.4607682228088379,
      "rewards/reward_func/mean": 0.5670922994613647,
      "rewards/reward_func/std": 0.4607682228088379,
      "step": 602,
      "step_time": 25.07387748733163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 190.6875,
      "completions/mean_terminated_length": 190.6875,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.30277078598737717,
      "epoch": 0.02792959703566466,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006902308668941259,
      "kl": 0.0012163456704001874,
      "learning_rate": 9.944233441408058e-07,
      "loss": 0.0001,
      "num_tokens": 16741433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 603,
      "step_time": 21.679902721196413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 171.0,
      "completions/mean_terminated_length": 171.0,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.2468840256333351,
      "epoch": 0.027975914775358962,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004982223617844284,
      "kl": 0.0009107012447202578,
      "learning_rate": 9.94414080592867e-07,
      "loss": 0.0,
      "num_tokens": 16773801.0,
      "reward": 0.788127601146698,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.788127601146698,
      "rewards/reward_func/std": 0.0,
      "step": 604,
      "step_time": 19.09699758887291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 140.1875,
      "completions/mean_terminated_length": 140.1875,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.2988521531224251,
      "epoch": 0.028022232515053265,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016440520994365215,
      "kl": 0.0015206864336505532,
      "learning_rate": 9.944048170449283e-07,
      "loss": 0.0001,
      "num_tokens": 16797276.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 605,
      "step_time": 16.558886874467134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 130.1875,
      "completions/mean_terminated_length": 130.1875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.3045179918408394,
      "epoch": 0.028068550254747568,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009281300008296967,
      "kl": 0.0012925256451126188,
      "learning_rate": 9.943955534969894e-07,
      "loss": 0.0001,
      "num_tokens": 16817215.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 606,
      "step_time": 15.212780736386776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 135.1875,
      "completions/mean_terminated_length": 135.1875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.30887145549058914,
      "epoch": 0.02811486799444187,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005796050536446273,
      "kl": 0.0009964118362404406,
      "learning_rate": 9.943862899490505e-07,
      "loss": 0.0001,
      "num_tokens": 16851202.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 607,
      "step_time": 17.76537637040019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 158.0625,
      "completions/mean_terminated_length": 158.0625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.38649002462625504,
      "epoch": 0.028161185734136174,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012031864607706666,
      "kl": 0.0015913881361484528,
      "learning_rate": 9.943770264011114e-07,
      "loss": 0.0001,
      "num_tokens": 16882403.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 608,
      "step_time": 19.019992608577013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 157.5625,
      "completions/mean_terminated_length": 157.5625,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.40181419998407364,
      "epoch": 0.028207503473830477,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006465300102718174,
      "kl": 0.0013402352924458683,
      "learning_rate": 9.943677628531728e-07,
      "loss": 0.0001,
      "num_tokens": 16927820.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 609,
      "step_time": 22.771573085337877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 149.6875,
      "completions/mean_terminated_length": 149.6875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.26405899599194527,
      "epoch": 0.02825382121352478,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001259437995031476,
      "kl": 0.0012018869747407734,
      "learning_rate": 9.94358499305234e-07,
      "loss": 0.0001,
      "num_tokens": 16947975.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 610,
      "step_time": 15.735291086137295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 183.6875,
      "completions/mean_terminated_length": 183.6875,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.4246085360646248,
      "epoch": 0.028300138953219083,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008513656794093549,
      "kl": 0.0013288070040289313,
      "learning_rate": 9.94349235757295e-07,
      "loss": 0.0001,
      "num_tokens": 16973810.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 611,
      "step_time": 19.875703874975443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 145.5,
      "completions/mean_terminated_length": 145.5,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.3269863650202751,
      "epoch": 0.028346456692913385,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012261215597391129,
      "kl": 0.001385840296279639,
      "learning_rate": 9.943399722093562e-07,
      "loss": 0.0001,
      "num_tokens": 16998506.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 612,
      "step_time": 16.11475994810462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 193.1875,
      "completions/mean_terminated_length": 193.1875,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.4552323967218399,
      "epoch": 0.02839277443260769,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010839778697118163,
      "kl": 0.0018463747692294419,
      "learning_rate": 9.943307086614173e-07,
      "loss": 0.0001,
      "num_tokens": 17039917.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 613,
      "step_time": 25.193405266851187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 306.0,
      "completions/max_terminated_length": 306.0,
      "completions/mean_length": 266.0,
      "completions/mean_terminated_length": 266.0,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "entropy": 0.2519208565354347,
      "epoch": 0.02843909217230199,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05388407036662102,
      "kl": 0.001017692542518489,
      "learning_rate": 9.943214451134784e-07,
      "loss": -0.0893,
      "num_tokens": 17073965.0,
      "reward": 0.6615020632743835,
      "reward_std": 0.44653603434562683,
      "rewards/reward_func/mean": 0.6615020632743835,
      "rewards/reward_func/std": 0.44653603434562683,
      "step": 614,
      "step_time": 27.768390368670225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 125.5625,
      "completions/mean_terminated_length": 125.5625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.25032806396484375,
      "epoch": 0.028485409911996294,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009284821571782231,
      "kl": 0.0011010119051206857,
      "learning_rate": 9.943121815655395e-07,
      "loss": 0.0001,
      "num_tokens": 17093478.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 615,
      "step_time": 13.723979100584984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 158.875,
      "completions/mean_terminated_length": 158.875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2890133857727051,
      "epoch": 0.028531727651690597,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13864818215370178,
      "kl": 0.0014494710485450923,
      "learning_rate": 9.943029180176007e-07,
      "loss": 0.1458,
      "num_tokens": 17120036.0,
      "reward": 0.7270569801330566,
      "reward_std": 0.36072129011154175,
      "rewards/reward_func/mean": 0.7270569801330566,
      "rewards/reward_func/std": 0.36072129011154175,
      "step": 616,
      "step_time": 21.460193186998367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 201.9375,
      "completions/mean_terminated_length": 201.9375,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.19010262191295624,
      "epoch": 0.0285780453913849,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.045914456248283386,
      "kl": 0.0007538544741692021,
      "learning_rate": 9.942936544696618e-07,
      "loss": 0.0197,
      "num_tokens": 17149395.0,
      "reward": 0.9353712797164917,
      "reward_std": 0.01723429374396801,
      "rewards/reward_func/mean": 0.9353712797164917,
      "rewards/reward_func/std": 0.017234310507774353,
      "step": 617,
      "step_time": 22.644675966352224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 116.3125,
      "completions/mean_terminated_length": 116.3125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.28089043498039246,
      "epoch": 0.028624363131079203,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001129591604694724,
      "kl": 0.001515325391665101,
      "learning_rate": 9.94284390921723e-07,
      "loss": 0.0001,
      "num_tokens": 17172552.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 618,
      "step_time": 13.440848540514708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 126.625,
      "completions/mean_terminated_length": 126.625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2327614687383175,
      "epoch": 0.028670680870773506,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011413868051022291,
      "kl": 0.0010565890843281522,
      "learning_rate": 9.942751273737842e-07,
      "loss": 0.0001,
      "num_tokens": 17193346.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 619,
      "step_time": 13.750414993613958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 118.0,
      "completions/mean_terminated_length": 118.0,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2940201684832573,
      "epoch": 0.02871699861046781,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022562395315617323,
      "kl": 0.0014221240853657946,
      "learning_rate": 9.942658638258452e-07,
      "loss": 0.0001,
      "num_tokens": 17213282.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 620,
      "step_time": 13.1810467466712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 171.9375,
      "completions/mean_terminated_length": 171.9375,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.2548486962914467,
      "epoch": 0.02876331635016211,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09021364897489548,
      "kl": 0.000993866182398051,
      "learning_rate": 9.942566002779063e-07,
      "loss": -0.0402,
      "num_tokens": 17238513.0,
      "reward": 0.23816077411174774,
      "reward_std": 0.02131837047636509,
      "rewards/reward_func/mean": 0.23816077411174774,
      "rewards/reward_func/std": 0.02131836861371994,
      "step": 621,
      "step_time": 18.491280663758516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 156.5625,
      "completions/mean_terminated_length": 156.5625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.24587681889533997,
      "epoch": 0.028809634089856415,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10550187528133392,
      "kl": 0.00158912711776793,
      "learning_rate": 9.942473367299676e-07,
      "loss": 0.0443,
      "num_tokens": 17269882.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 622,
      "step_time": 19.237867150455713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 133.125,
      "completions/mean_terminated_length": 133.125,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.1528434455394745,
      "epoch": 0.028855951829550718,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010806182399392128,
      "kl": 0.0008311954879900441,
      "learning_rate": 9.942380731820287e-07,
      "loss": 0.0,
      "num_tokens": 17290028.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 623,
      "step_time": 14.853768266737461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 174.8125,
      "completions/mean_terminated_length": 174.8125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3995269536972046,
      "epoch": 0.02890226956924502,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015165697550401092,
      "kl": 0.0018526142812334,
      "learning_rate": 9.942288096340899e-07,
      "loss": 0.0001,
      "num_tokens": 17323161.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 624,
      "step_time": 20.973382882773876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 182.5,
      "completions/mean_terminated_length": 182.5,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.1739337407052517,
      "epoch": 0.028948587308939323,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004962090752087533,
      "kl": 0.0007467141549568623,
      "learning_rate": 9.94219546086151e-07,
      "loss": 0.0,
      "num_tokens": 17360641.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 625,
      "step_time": 21.581327740103006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 173.0,
      "completions/mean_terminated_length": 173.0,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3666951060295105,
      "epoch": 0.028994905048633626,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012159666512161493,
      "kl": 0.0014632449019700289,
      "learning_rate": 9.942102825382121e-07,
      "loss": 0.0001,
      "num_tokens": 17390001.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 626,
      "step_time": 19.581063494086266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 120.125,
      "completions/mean_terminated_length": 120.125,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.24443429335951805,
      "epoch": 0.02904122278832793,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002116140676662326,
      "kl": 0.00103072528145276,
      "learning_rate": 9.942010189902732e-07,
      "loss": 0.0001,
      "num_tokens": 17409299.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 627,
      "step_time": 12.728528279811144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 144.6875,
      "completions/mean_terminated_length": 144.6875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.304414801299572,
      "epoch": 0.029087540528022232,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000975096772890538,
      "kl": 0.0013835621648468077,
      "learning_rate": 9.941917554423344e-07,
      "loss": 0.0001,
      "num_tokens": 17440190.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 628,
      "step_time": 17.68225933238864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 127.0,
      "completions/max_terminated_length": 127.0,
      "completions/mean_length": 115.0,
      "completions/mean_terminated_length": 115.0,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.27537262067198753,
      "epoch": 0.029133858267716535,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012910662917420268,
      "kl": 0.0012789879401680082,
      "learning_rate": 9.941824918943955e-07,
      "loss": 0.0001,
      "num_tokens": 17459598.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 629,
      "step_time": 12.648746185004711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 327.0,
      "completions/max_terminated_length": 327.0,
      "completions/mean_length": 222.5625,
      "completions/mean_terminated_length": 222.5625,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.30768710747361183,
      "epoch": 0.029180176007410838,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06626363843679428,
      "kl": 0.0008501510455971584,
      "learning_rate": 9.941732283464566e-07,
      "loss": 0.0411,
      "num_tokens": 17499079.0,
      "reward": 0.05458332970738411,
      "reward_std": 0.03800695016980171,
      "rewards/reward_func/mean": 0.05458332970738411,
      "rewards/reward_func/std": 0.03800695016980171,
      "step": 630,
      "step_time": 31.33593814447522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 262.0625,
      "completions/mean_terminated_length": 262.0625,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "entropy": 0.2470867969095707,
      "epoch": 0.02922649374710514,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0614062137901783,
      "kl": 0.0006329155585262924,
      "learning_rate": 9.941639647985177e-07,
      "loss": -0.0538,
      "num_tokens": 17523640.0,
      "reward": 0.7354071140289307,
      "reward_std": 0.28707355260849,
      "rewards/reward_func/mean": 0.7354071140289307,
      "rewards/reward_func/std": 0.2870735824108124,
      "step": 631,
      "step_time": 26.915731094777584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 141.3125,
      "completions/mean_terminated_length": 141.3125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.21450427547097206,
      "epoch": 0.029272811486799444,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015268087154254317,
      "kl": 0.0008688341185916215,
      "learning_rate": 9.941547012505789e-07,
      "loss": 0.0,
      "num_tokens": 17543437.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 632,
      "step_time": 14.755547858774662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 153.0625,
      "completions/mean_terminated_length": 153.0625,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.37368296831846237,
      "epoch": 0.029319129226493747,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001764249405823648,
      "kl": 0.0018268795975018293,
      "learning_rate": 9.9414543770264e-07,
      "loss": 0.0001,
      "num_tokens": 17585598.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 633,
      "step_time": 20.64495661482215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 143.1875,
      "completions/mean_terminated_length": 143.1875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.29835107922554016,
      "epoch": 0.02936544696618805,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006717204814776778,
      "kl": 0.001157870385213755,
      "learning_rate": 9.941361741547011e-07,
      "loss": 0.0001,
      "num_tokens": 17612289.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 634,
      "step_time": 15.97997947409749
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 132.9375,
      "completions/mean_terminated_length": 132.9375,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.3083609566092491,
      "epoch": 0.029411764705882353,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011125041637569666,
      "kl": 0.0014098618412390351,
      "learning_rate": 9.941269106067625e-07,
      "loss": 0.0001,
      "num_tokens": 17635552.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 635,
      "step_time": 14.866094164550304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 190.75,
      "completions/mean_terminated_length": 190.75,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.22032272443175316,
      "epoch": 0.029458082445576655,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10421456396579742,
      "kl": 0.002787069766782224,
      "learning_rate": 9.941176470588236e-07,
      "loss": -0.033,
      "num_tokens": 17672476.0,
      "reward": 0.6601945161819458,
      "reward_std": 0.1685907542705536,
      "rewards/reward_func/mean": 0.6601945161819458,
      "rewards/reward_func/std": 0.1685907542705536,
      "step": 636,
      "step_time": 22.77554728835821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 317.0,
      "completions/max_terminated_length": 317.0,
      "completions/mean_length": 237.625,
      "completions/mean_terminated_length": 237.625,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.3203084096312523,
      "epoch": 0.02950440018527096,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0523492656648159,
      "kl": 0.0009472902747802436,
      "learning_rate": 9.941083835108847e-07,
      "loss": -0.0273,
      "num_tokens": 17700726.0,
      "reward": 0.5625,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.5625,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 637,
      "step_time": 27.360778879374266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 189.5625,
      "completions/mean_terminated_length": 189.5625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3144254833459854,
      "epoch": 0.02955071792496526,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012330756289884448,
      "kl": 0.0012759679520968348,
      "learning_rate": 9.940991199629456e-07,
      "loss": 0.0001,
      "num_tokens": 17723903.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 638,
      "step_time": 23.00736243277788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 137.125,
      "completions/mean_terminated_length": 137.125,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2956559658050537,
      "epoch": 0.029597035664659564,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007645520381629467,
      "kl": 0.001081657683243975,
      "learning_rate": 9.94089856415007e-07,
      "loss": 0.0001,
      "num_tokens": 17745201.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 639,
      "step_time": 15.197411470115185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 182.25,
      "completions/mean_terminated_length": 182.25,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.4124782010912895,
      "epoch": 0.029643353404353867,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08005214482545853,
      "kl": 0.0014793944428674877,
      "learning_rate": 9.94080592867068e-07,
      "loss": 0.0139,
      "num_tokens": 17767669.0,
      "reward": 0.23001110553741455,
      "reward_std": 0.4114563465118408,
      "rewards/reward_func/mean": 0.23001110553741455,
      "rewards/reward_func/std": 0.4114563763141632,
      "step": 640,
      "step_time": 20.014618009328842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 130.5625,
      "completions/mean_terminated_length": 130.5625,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.24497872963547707,
      "epoch": 0.02968967114404817,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001171390525996685,
      "kl": 0.0010745792533271015,
      "learning_rate": 9.940713293191292e-07,
      "loss": 0.0001,
      "num_tokens": 17787262.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 641,
      "step_time": 13.819535158574581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 137.0625,
      "completions/mean_terminated_length": 137.0625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.2683763653039932,
      "epoch": 0.029735988883742473,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020587260369211435,
      "kl": 0.001427650888217613,
      "learning_rate": 9.940620657711903e-07,
      "loss": 0.0001,
      "num_tokens": 17823295.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 642,
      "step_time": 18.109300438314676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 157.9375,
      "completions/mean_terminated_length": 157.9375,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.14643412828445435,
      "epoch": 0.029782306623436776,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005344918463379145,
      "kl": 0.0007510486320825294,
      "learning_rate": 9.940528022232515e-07,
      "loss": 0.0,
      "num_tokens": 17844654.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 643,
      "step_time": 16.412567649036646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 155.125,
      "completions/mean_terminated_length": 155.125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.2538882680237293,
      "epoch": 0.02982862436313108,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007700459682382643,
      "kl": 0.0010706761095207185,
      "learning_rate": 9.940435386753126e-07,
      "loss": 0.0001,
      "num_tokens": 17874240.0,
      "reward": 0.030798785388469696,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.030798785388469696,
      "rewards/reward_func/std": 0.0,
      "step": 644,
      "step_time": 18.515170965343714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 175.0,
      "completions/mean_terminated_length": 175.0,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.404374897480011,
      "epoch": 0.02987494210282538,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007907731342129409,
      "kl": 0.0010905256785918027,
      "learning_rate": 9.940342751273737e-07,
      "loss": 0.0001,
      "num_tokens": 17899104.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 645,
      "step_time": 18.668986041098833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 177.0,
      "completions/mean_terminated_length": 177.0,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.22166218608617783,
      "epoch": 0.029921259842519685,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05660930275917053,
      "kl": 0.0013204931165091693,
      "learning_rate": 9.940250115794348e-07,
      "loss": -0.0253,
      "num_tokens": 17927312.0,
      "reward": 0.29995638132095337,
      "reward_std": 0.16590999066829681,
      "rewards/reward_func/mean": 0.29995638132095337,
      "rewards/reward_func/std": 0.165910005569458,
      "step": 646,
      "step_time": 19.130444202572107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 177.5,
      "completions/mean_terminated_length": 177.5,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.4196074977517128,
      "epoch": 0.029967577582213988,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006432700320146978,
      "kl": 0.0012449279602151364,
      "learning_rate": 9.94015748031496e-07,
      "loss": 0.0001,
      "num_tokens": 17948568.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 647,
      "step_time": 18.71919671073556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 112.6875,
      "completions/mean_terminated_length": 112.6875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.27163299173116684,
      "epoch": 0.03001389532190829,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009595048613846302,
      "kl": 0.0012287074932828546,
      "learning_rate": 9.94006484483557e-07,
      "loss": 0.0001,
      "num_tokens": 17969571.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 648,
      "step_time": 13.595854740589857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 127.375,
      "completions/mean_terminated_length": 127.375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2343696802854538,
      "epoch": 0.030060213061602593,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008790047722868621,
      "kl": 0.0011380193172954023,
      "learning_rate": 9.939972209356184e-07,
      "loss": 0.0001,
      "num_tokens": 17989209.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 649,
      "step_time": 13.889198988676071
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 120.0625,
      "completions/mean_terminated_length": 120.0625,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.3073343336582184,
      "epoch": 0.030106530801296896,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010500141652300954,
      "kl": 0.0011955422814935446,
      "learning_rate": 9.939879573876795e-07,
      "loss": 0.0001,
      "num_tokens": 18011130.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 650,
      "step_time": 14.17166980728507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 170.1875,
      "completions/mean_terminated_length": 170.1875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.2249767668545246,
      "epoch": 0.0301528485409912,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006380456034094095,
      "kl": 0.0009065577760338783,
      "learning_rate": 9.939786938397405e-07,
      "loss": 0.0,
      "num_tokens": 18033565.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 651,
      "step_time": 18.464233096688986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 205.75,
      "completions/mean_terminated_length": 205.75,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.4476078003644943,
      "epoch": 0.030199166280685502,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007847800734452903,
      "kl": 0.0012855999520979822,
      "learning_rate": 9.939694302918018e-07,
      "loss": 0.0001,
      "num_tokens": 18063513.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 652,
      "step_time": 22.482271995395422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 329.0,
      "completions/max_terminated_length": 329.0,
      "completions/mean_length": 279.75,
      "completions/mean_terminated_length": 279.75,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "entropy": 0.2994404062628746,
      "epoch": 0.030245484020379805,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05273708701133728,
      "kl": 0.0009134301217272878,
      "learning_rate": 9.93960166743863e-07,
      "loss": -0.0238,
      "num_tokens": 18098741.0,
      "reward": 0.7205860614776611,
      "reward_std": 0.14855854213237762,
      "rewards/reward_func/mean": 0.7205860614776611,
      "rewards/reward_func/std": 0.14855854213237762,
      "step": 653,
      "step_time": 29.735829323530197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 174.75,
      "completions/mean_terminated_length": 174.75,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.26028087735176086,
      "epoch": 0.030291801760074108,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009759777458384633,
      "kl": 0.0010395431017968804,
      "learning_rate": 9.93950903195924e-07,
      "loss": 0.0001,
      "num_tokens": 18120545.0,
      "reward": 0.4723665416240692,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.4723665416240692,
      "rewards/reward_func/std": 0.0,
      "step": 654,
      "step_time": 18.53537382557988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 292.0,
      "completions/max_terminated_length": 292.0,
      "completions/mean_length": 205.1875,
      "completions/mean_terminated_length": 205.1875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.4086238220334053,
      "epoch": 0.03033811949976841,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07738785445690155,
      "kl": 0.0015518628933932632,
      "learning_rate": 9.939416396479852e-07,
      "loss": -0.1105,
      "num_tokens": 18142452.0,
      "reward": 0.18607217073440552,
      "reward_std": 0.3487555980682373,
      "rewards/reward_func/mean": 0.18607217073440552,
      "rewards/reward_func/std": 0.3487556278705597,
      "step": 655,
      "step_time": 24.589965999126434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 172.9375,
      "completions/mean_terminated_length": 172.9375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3716308921575546,
      "epoch": 0.030384437239462714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010996874189004302,
      "kl": 0.0015363656275440007,
      "learning_rate": 9.939323761000463e-07,
      "loss": 0.0001,
      "num_tokens": 18193555.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 656,
      "step_time": 24.94709513708949
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 203.9375,
      "completions/mean_terminated_length": 203.9375,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.24424386769533157,
      "epoch": 0.030430754979157017,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06499525904655457,
      "kl": 0.0011902508849743754,
      "learning_rate": 9.939231125521074e-07,
      "loss": -0.0378,
      "num_tokens": 18231474.0,
      "reward": 0.8895835876464844,
      "reward_std": 0.23738820850849152,
      "rewards/reward_func/mean": 0.8895835876464844,
      "rewards/reward_func/std": 0.2373882383108139,
      "step": 657,
      "step_time": 23.588997296988964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 133.4375,
      "completions/mean_terminated_length": 133.4375,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.2545120343565941,
      "epoch": 0.03047707271885132,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011487004812806845,
      "kl": 0.0011745925585273653,
      "learning_rate": 9.939138490041685e-07,
      "loss": 0.0001,
      "num_tokens": 18251545.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 658,
      "step_time": 14.423538401722908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 196.6875,
      "completions/mean_terminated_length": 196.6875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.3835623487830162,
      "epoch": 0.030523390458545623,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008706080261617899,
      "kl": 0.0012670426804106683,
      "learning_rate": 9.939045854562297e-07,
      "loss": 0.0001,
      "num_tokens": 18279988.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 659,
      "step_time": 23.73527555912733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 200.0625,
      "completions/mean_terminated_length": 200.0625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.32817748188972473,
      "epoch": 0.030569708198239925,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10197916626930237,
      "kl": 0.0013327872147783637,
      "learning_rate": 9.938953219082908e-07,
      "loss": -0.0552,
      "num_tokens": 18305589.0,
      "reward": 0.11260820180177689,
      "reward_std": 0.1164192333817482,
      "rewards/reward_func/mean": 0.11260820180177689,
      "rewards/reward_func/std": 0.1164192333817482,
      "step": 660,
      "step_time": 21.253589272499084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 197.8125,
      "completions/mean_terminated_length": 197.8125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.2890971675515175,
      "epoch": 0.03061602593793423,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08252923935651779,
      "kl": 0.001193336385767907,
      "learning_rate": 9.93886058360352e-07,
      "loss": 0.0051,
      "num_tokens": 18335218.0,
      "reward": 0.5322065353393555,
      "reward_std": 0.4849793314933777,
      "rewards/reward_func/mean": 0.5322065353393555,
      "rewards/reward_func/std": 0.4849793612957001,
      "step": 661,
      "step_time": 21.42082080245018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 139.5,
      "completions/mean_terminated_length": 139.5,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.27542803436517715,
      "epoch": 0.03066234367762853,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006713405600748956,
      "kl": 0.0009908601932693273,
      "learning_rate": 9.938767948124132e-07,
      "loss": 0.0,
      "num_tokens": 18354986.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 662,
      "step_time": 15.230031374841928
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 137.5,
      "completions/mean_terminated_length": 137.5,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.33815905451774597,
      "epoch": 0.030708661417322834,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010544717079028487,
      "kl": 0.0012565023789647967,
      "learning_rate": 9.938675312644742e-07,
      "loss": 0.0001,
      "num_tokens": 18391026.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 663,
      "step_time": 19.1298321262002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 186.75,
      "completions/mean_terminated_length": 186.75,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.41814157366752625,
      "epoch": 0.030754979157017137,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07524161785840988,
      "kl": 0.0014231793174985796,
      "learning_rate": 9.938582677165353e-07,
      "loss": -0.0685,
      "num_tokens": 18420798.0,
      "reward": 0.058713316917419434,
      "reward_std": 0.23485326766967773,
      "rewards/reward_func/mean": 0.058713316917419434,
      "rewards/reward_func/std": 0.23485328257083893,
      "step": 664,
      "step_time": 22.819775208830833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 415.0,
      "completions/max_terminated_length": 415.0,
      "completions/mean_length": 212.875,
      "completions/mean_terminated_length": 212.875,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.3408391624689102,
      "epoch": 0.03080129689671144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07138629257678986,
      "kl": 0.0011429604492150247,
      "learning_rate": 9.938490041685964e-07,
      "loss": -0.1248,
      "num_tokens": 18444636.0,
      "reward": 0.1933698058128357,
      "reward_std": 0.29622000455856323,
      "rewards/reward_func/mean": 0.1933698058128357,
      "rewards/reward_func/std": 0.29622000455856323,
      "step": 665,
      "step_time": 34.46691955626011
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 128.0625,
      "completions/mean_terminated_length": 128.0625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.21934204548597336,
      "epoch": 0.030847614636405743,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015187819954007864,
      "kl": 0.0013615007337648422,
      "learning_rate": 9.938397406206577e-07,
      "loss": 0.0001,
      "num_tokens": 18464093.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 666,
      "step_time": 13.363538708537817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 150.6875,
      "completions/mean_terminated_length": 150.6875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3489423394203186,
      "epoch": 0.030893932376100046,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019261379493400455,
      "kl": 0.0015275503683369607,
      "learning_rate": 9.938304770727189e-07,
      "loss": 0.0001,
      "num_tokens": 18489144.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 667,
      "step_time": 17.13107032701373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 190.0,
      "completions/mean_terminated_length": 190.0,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.3804415836930275,
      "epoch": 0.03094025011579435,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003983758215326816,
      "kl": 0.0010572924802545458,
      "learning_rate": 9.9382121352478e-07,
      "loss": 0.0001,
      "num_tokens": 18509896.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 668,
      "step_time": 21.60600521788001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 157.0,
      "completions/mean_terminated_length": 157.0,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3257174715399742,
      "epoch": 0.03098656785548865,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11692310869693756,
      "kl": 0.001714501326205209,
      "learning_rate": 9.938119499768411e-07,
      "loss": -0.085,
      "num_tokens": 18531288.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 669,
      "step_time": 18.257702708244324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 173.875,
      "completions/mean_terminated_length": 173.875,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.30243848264217377,
      "epoch": 0.031032885595182955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011854803888127208,
      "kl": 0.0012532831751741469,
      "learning_rate": 9.938026864289022e-07,
      "loss": 0.0001,
      "num_tokens": 18555542.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 670,
      "step_time": 19.74624053761363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 187.1875,
      "completions/mean_terminated_length": 187.1875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.4084184989333153,
      "epoch": 0.031079203334877258,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010090351570397615,
      "kl": 0.0015629433910362422,
      "learning_rate": 9.937934228809634e-07,
      "loss": 0.0001,
      "num_tokens": 18579513.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 671,
      "step_time": 21.323964346200228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 118.8125,
      "completions/mean_terminated_length": 118.8125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.26723024621605873,
      "epoch": 0.03112552107457156,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001027853460982442,
      "kl": 0.0011152219667565078,
      "learning_rate": 9.937841593330245e-07,
      "loss": 0.0001,
      "num_tokens": 18600070.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 672,
      "step_time": 13.687433570623398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 291.0,
      "completions/max_terminated_length": 291.0,
      "completions/mean_length": 195.6875,
      "completions/mean_terminated_length": 195.6875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.30489207804203033,
      "epoch": 0.031171838814265863,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07319233566522598,
      "kl": 0.001412037032423541,
      "learning_rate": 9.937748957850856e-07,
      "loss": 0.1205,
      "num_tokens": 18623745.0,
      "reward": 0.03309401869773865,
      "reward_std": 0.01641923189163208,
      "rewards/reward_func/mean": 0.03309401869773865,
      "rewards/reward_func/std": 0.01641923189163208,
      "step": 673,
      "step_time": 24.96692108362913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 138.5,
      "completions/mean_terminated_length": 138.5,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.2258298024535179,
      "epoch": 0.031218156553960166,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000795226835180074,
      "kl": 0.0010459717304911464,
      "learning_rate": 9.937656322371467e-07,
      "loss": 0.0001,
      "num_tokens": 18646553.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 674,
      "step_time": 15.537427980452776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 153.1875,
      "completions/mean_terminated_length": 153.1875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.3664705604314804,
      "epoch": 0.03126447429365447,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009839312406256795,
      "kl": 0.001658998487982899,
      "learning_rate": 9.937563686892079e-07,
      "loss": 0.0001,
      "num_tokens": 18667644.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 675,
      "step_time": 16.918759364634752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 124.0625,
      "completions/mean_terminated_length": 124.0625,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.25809136033058167,
      "epoch": 0.03131079203334877,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021402640268206596,
      "kl": 0.0013725956960115582,
      "learning_rate": 9.93747105141269e-07,
      "loss": 0.0001,
      "num_tokens": 18690877.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 676,
      "step_time": 14.425536841154099
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 137.0625,
      "completions/mean_terminated_length": 137.0625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.34704677760601044,
      "epoch": 0.031357109773043075,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020199345890432596,
      "kl": 0.0015883515297900885,
      "learning_rate": 9.937378415933301e-07,
      "loss": 0.0001,
      "num_tokens": 18720398.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 677,
      "step_time": 17.222666319459677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 121.25,
      "completions/mean_terminated_length": 121.25,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.28825974464416504,
      "epoch": 0.03140342751273738,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007279080455191433,
      "kl": 0.0011206761992070824,
      "learning_rate": 9.937285780453912e-07,
      "loss": 0.0001,
      "num_tokens": 18744434.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 678,
      "step_time": 14.141832951456308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 174.9375,
      "completions/mean_terminated_length": 174.9375,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3730523809790611,
      "epoch": 0.03144974525243168,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006375746452249587,
      "kl": 0.0011505739093990996,
      "learning_rate": 9.937193144974526e-07,
      "loss": 0.0001,
      "num_tokens": 18772705.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 679,
      "step_time": 22.287692293524742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 159.3125,
      "completions/mean_terminated_length": 159.3125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3158295676112175,
      "epoch": 0.031496062992125984,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012767198495566845,
      "kl": 0.0013278848491609097,
      "learning_rate": 9.937100509495137e-07,
      "loss": 0.0001,
      "num_tokens": 18800246.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 680,
      "step_time": 17.864353463053703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 136.25,
      "completions/mean_terminated_length": 136.25,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.35829856991767883,
      "epoch": 0.03154238073182029,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012588640674948692,
      "kl": 0.0017853121389634907,
      "learning_rate": 9.937007874015746e-07,
      "loss": 0.0001,
      "num_tokens": 18855642.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 681,
      "step_time": 23.69368052110076
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 128.0,
      "completions/max_terminated_length": 128.0,
      "completions/mean_length": 112.125,
      "completions/mean_terminated_length": 112.125,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "entropy": 0.24923455342650414,
      "epoch": 0.03158869847151459,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001384776784107089,
      "kl": 0.0011787291732616723,
      "learning_rate": 9.93691523853636e-07,
      "loss": 0.0001,
      "num_tokens": 18875004.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 682,
      "step_time": 12.379692498594522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 155.75,
      "completions/mean_terminated_length": 155.75,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.4157654121518135,
      "epoch": 0.03163501621120889,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006648803828284144,
      "kl": 0.0014104731380939484,
      "learning_rate": 9.93682260305697e-07,
      "loss": 0.0001,
      "num_tokens": 18909208.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 683,
      "step_time": 19.277349393814802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 139.25,
      "completions/mean_terminated_length": 139.25,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.20814481377601624,
      "epoch": 0.031681333950903195,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008208968793042004,
      "kl": 0.0009661391522968188,
      "learning_rate": 9.936729967577582e-07,
      "loss": 0.0,
      "num_tokens": 18939628.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 684,
      "step_time": 17.76503198221326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 144.3125,
      "completions/mean_terminated_length": 144.3125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.40677928179502487,
      "epoch": 0.0317276516905975,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007508598500862718,
      "kl": 0.0013949218846391886,
      "learning_rate": 9.936637332098193e-07,
      "loss": 0.0001,
      "num_tokens": 18984465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 685,
      "step_time": 20.997331146150827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 137.9375,
      "completions/mean_terminated_length": 137.9375,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.2835551127791405,
      "epoch": 0.0317739694302918,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018957066349685192,
      "kl": 0.001669476885581389,
      "learning_rate": 9.936544696618805e-07,
      "loss": 0.0001,
      "num_tokens": 19005168.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 686,
      "step_time": 15.702382504940033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 125.0,
      "completions/max_terminated_length": 125.0,
      "completions/mean_length": 111.1875,
      "completions/mean_terminated_length": 111.1875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.24426046386361122,
      "epoch": 0.031820287169986104,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001307345344685018,
      "kl": 0.0013860033504897729,
      "learning_rate": 9.936452061139416e-07,
      "loss": 0.0001,
      "num_tokens": 19025619.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 687,
      "step_time": 13.51840564981103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 191.0625,
      "completions/mean_terminated_length": 191.0625,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.3493984639644623,
      "epoch": 0.03186660490968041,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10677601397037506,
      "kl": 0.0011691860418068245,
      "learning_rate": 9.936359425660027e-07,
      "loss": 0.0002,
      "num_tokens": 19053924.0,
      "reward": 0.01930723339319229,
      "reward_std": 0.03173091635107994,
      "rewards/reward_func/mean": 0.01930723339319229,
      "rewards/reward_func/std": 0.03173091635107994,
      "step": 688,
      "step_time": 22.76512398570776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 122.625,
      "completions/mean_terminated_length": 122.625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.25590357929468155,
      "epoch": 0.03191292264937471,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011609547073021531,
      "kl": 0.000935665360884741,
      "learning_rate": 9.936266790180638e-07,
      "loss": 0.0,
      "num_tokens": 19075662.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 689,
      "step_time": 14.06349989399314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 133.9375,
      "completions/mean_terminated_length": 133.9375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3027240112423897,
      "epoch": 0.03195924038906901,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005010907188989222,
      "kl": 0.0008855178894009441,
      "learning_rate": 9.93617415470125e-07,
      "loss": 0.0,
      "num_tokens": 19103821.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 690,
      "step_time": 17.320686750113964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 141.125,
      "completions/mean_terminated_length": 141.125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.2675025314092636,
      "epoch": 0.032005558128763316,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001415303093381226,
      "kl": 0.0014167480112519115,
      "learning_rate": 9.93608151922186e-07,
      "loss": 0.0001,
      "num_tokens": 19123599.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 691,
      "step_time": 16.920801613479853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 136.4375,
      "completions/mean_terminated_length": 136.4375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.3726763650774956,
      "epoch": 0.03205187586845762,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016297450056299567,
      "kl": 0.0019747884798562154,
      "learning_rate": 9.935988883742474e-07,
      "loss": 0.0001,
      "num_tokens": 19161190.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 692,
      "step_time": 18.093306742608547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 153.5625,
      "completions/mean_terminated_length": 153.5625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.436239056289196,
      "epoch": 0.03209819360815192,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001057538203895092,
      "kl": 0.0018068332865368575,
      "learning_rate": 9.935896248263085e-07,
      "loss": 0.0001,
      "num_tokens": 19210543.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 693,
      "step_time": 22.804465100169182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 184.0,
      "completions/mean_terminated_length": 184.0,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.36290524154901505,
      "epoch": 0.032144511347846225,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07543493807315826,
      "kl": 0.001298435847274959,
      "learning_rate": 9.935803612783695e-07,
      "loss": 0.0973,
      "num_tokens": 19237647.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 694,
      "step_time": 22.445793222635984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 183.625,
      "completions/mean_terminated_length": 183.625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.3356466591358185,
      "epoch": 0.03219082908754053,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09124850481748581,
      "kl": 0.0016333387466147542,
      "learning_rate": 9.935710977304306e-07,
      "loss": 0.0035,
      "num_tokens": 19262985.0,
      "reward": 0.058969270437955856,
      "reward_std": 0.10548744350671768,
      "rewards/reward_func/mean": 0.058969270437955856,
      "rewards/reward_func/std": 0.10548743605613708,
      "step": 695,
      "step_time": 23.154754973948002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 227.125,
      "completions/mean_terminated_length": 227.125,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "entropy": 0.20573032274842262,
      "epoch": 0.03223714682723483,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06313058733940125,
      "kl": 0.0008511656487826258,
      "learning_rate": 9.93561834182492e-07,
      "loss": 0.0208,
      "num_tokens": 19299803.0,
      "reward": 0.9785919189453125,
      "reward_std": 0.08563227206468582,
      "rewards/reward_func/mean": 0.9785919189453125,
      "rewards/reward_func/std": 0.08563227951526642,
      "step": 696,
      "step_time": 24.87244164943695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 128.6875,
      "completions/mean_terminated_length": 128.6875,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.350361131131649,
      "epoch": 0.03228346456692913,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009798979153856635,
      "kl": 0.00100500434928108,
      "learning_rate": 9.93552570634553e-07,
      "loss": 0.0001,
      "num_tokens": 19324214.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 697,
      "step_time": 15.373975336551666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 122.875,
      "completions/mean_terminated_length": 122.875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.29454803466796875,
      "epoch": 0.032329782306623436,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001718203304335475,
      "kl": 0.0014642720634583384,
      "learning_rate": 9.935433070866142e-07,
      "loss": 0.0001,
      "num_tokens": 19345044.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 698,
      "step_time": 13.65459056571126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 160.1875,
      "completions/mean_terminated_length": 160.1875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3443695157766342,
      "epoch": 0.03237610004631774,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009369025938212872,
      "kl": 0.0011791688884841278,
      "learning_rate": 9.935340435386753e-07,
      "loss": 0.0001,
      "num_tokens": 19366391.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 699,
      "step_time": 16.102231845259666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 147.125,
      "completions/mean_terminated_length": 147.125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.21535206213593483,
      "epoch": 0.03242241778601204,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00042517686961218715,
      "kl": 0.0007701033173361793,
      "learning_rate": 9.935247799907364e-07,
      "loss": 0.0,
      "num_tokens": 19387657.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 700,
      "step_time": 16.184305012226105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 284.0,
      "completions/max_terminated_length": 284.0,
      "completions/mean_length": 197.125,
      "completions/mean_terminated_length": 197.125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.34540799260139465,
      "epoch": 0.032468735525706345,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07063841819763184,
      "kl": 0.0010681675630621612,
      "learning_rate": 9.935155164427975e-07,
      "loss": -0.0456,
      "num_tokens": 19409131.0,
      "reward": 0.7472657561302185,
      "reward_std": 0.371590256690979,
      "rewards/reward_func/mean": 0.7472657561302185,
      "rewards/reward_func/std": 0.3715902864933014,
      "step": 701,
      "step_time": 24.055828519165516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 136.25,
      "completions/mean_terminated_length": 136.25,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.2532971352338791,
      "epoch": 0.03251505326540065,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007650414481759071,
      "kl": 0.0010268701007589698,
      "learning_rate": 9.935062528948587e-07,
      "loss": 0.0001,
      "num_tokens": 19433935.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 702,
      "step_time": 15.407848794013262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 187.625,
      "completions/mean_terminated_length": 187.625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.45612356066703796,
      "epoch": 0.03256137100509495,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006161848432384431,
      "kl": 0.0013176907668821514,
      "learning_rate": 9.934969893469198e-07,
      "loss": 0.0001,
      "num_tokens": 19459193.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 703,
      "step_time": 20.25391785427928
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 224.4375,
      "completions/mean_terminated_length": 224.4375,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.2529004104435444,
      "epoch": 0.032607688744789254,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05345242843031883,
      "kl": 0.0006900770240463316,
      "learning_rate": 9.93487725798981e-07,
      "loss": -0.0405,
      "num_tokens": 19493248.0,
      "reward": 0.9401124715805054,
      "reward_std": 0.23955021798610687,
      "rewards/reward_func/mean": 0.9401124715805054,
      "rewards/reward_func/std": 0.23955021798610687,
      "step": 704,
      "step_time": 23.980402942746878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 195.75,
      "completions/mean_terminated_length": 195.75,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.424755334854126,
      "epoch": 0.03265400648448356,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000872373057063669,
      "kl": 0.0015883336891420186,
      "learning_rate": 9.934784622510423e-07,
      "loss": 0.0001,
      "num_tokens": 19516060.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 705,
      "step_time": 22.41007725521922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 336.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 269.75,
      "completions/mean_terminated_length": 269.75,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "entropy": 0.23423856869339943,
      "epoch": 0.03270032422417786,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05221366882324219,
      "kl": 0.0008720298501430079,
      "learning_rate": 9.934691987031032e-07,
      "loss": -0.0714,
      "num_tokens": 19544920.0,
      "reward": 0.9230508804321289,
      "reward_std": 0.13765078783035278,
      "rewards/reward_func/mean": 0.9230508804321289,
      "rewards/reward_func/std": 0.13765080273151398,
      "step": 706,
      "step_time": 28.587394293397665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 139.0625,
      "completions/mean_terminated_length": 139.0625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.28025446459650993,
      "epoch": 0.03274664196387216,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015409340849146247,
      "kl": 0.0013668817409779876,
      "learning_rate": 9.934599351551643e-07,
      "loss": 0.0001,
      "num_tokens": 19573241.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 707,
      "step_time": 18.28684702515602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 156.3125,
      "completions/mean_terminated_length": 156.3125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.2281733974814415,
      "epoch": 0.032792959703566466,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09042426943778992,
      "kl": 0.0009373894135933369,
      "learning_rate": 9.934506716072254e-07,
      "loss": 0.011,
      "num_tokens": 19593982.0,
      "reward": 0.4834691882133484,
      "reward_std": 0.19668100774288177,
      "rewards/reward_func/mean": 0.4834691882133484,
      "rewards/reward_func/std": 0.19668102264404297,
      "step": 708,
      "step_time": 18.26748325675726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 184.625,
      "completions/mean_terminated_length": 184.625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.2592911906540394,
      "epoch": 0.03283927744326077,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005293122376315296,
      "kl": 0.0008192450914066285,
      "learning_rate": 9.934414080592868e-07,
      "loss": 0.0,
      "num_tokens": 19621368.0,
      "reward": 0.7788007855415344,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7788007855415344,
      "rewards/reward_func/std": 0.0,
      "step": 709,
      "step_time": 22.77542806044221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 193.25,
      "completions/mean_terminated_length": 193.25,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.4455046057701111,
      "epoch": 0.03288559518295507,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007364381453953683,
      "kl": 0.0015260854561347514,
      "learning_rate": 9.934321445113479e-07,
      "loss": 0.0001,
      "num_tokens": 19676332.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 710,
      "step_time": 28.35394797474146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 159.0625,
      "completions/mean_terminated_length": 159.0625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.3817960396409035,
      "epoch": 0.032931912922649374,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005524643347598612,
      "kl": 0.001286326936678961,
      "learning_rate": 9.93422880963409e-07,
      "loss": 0.0001,
      "num_tokens": 19708141.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 711,
      "step_time": 19.659467611461878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 171.1875,
      "completions/mean_terminated_length": 171.1875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.21849926188588142,
      "epoch": 0.03297823066234368,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18764466047286987,
      "kl": 0.001313518121605739,
      "learning_rate": 9.934136174154701e-07,
      "loss": -0.0318,
      "num_tokens": 19729360.0,
      "reward": 0.2936846613883972,
      "reward_std": 0.18835076689720154,
      "rewards/reward_func/mean": 0.2936846613883972,
      "rewards/reward_func/std": 0.18835076689720154,
      "step": 712,
      "step_time": 18.324943736195564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 137.5625,
      "completions/mean_terminated_length": 137.5625,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.24218040704727173,
      "epoch": 0.03302454840203798,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00061118631856516,
      "kl": 0.0008218033908633515,
      "learning_rate": 9.934043538675313e-07,
      "loss": 0.0,
      "num_tokens": 19753625.0,
      "reward": 0.35782673954963684,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.35782673954963684,
      "rewards/reward_func/std": 0.0,
      "step": 713,
      "step_time": 16.36056460440159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 131.75,
      "completions/mean_terminated_length": 131.75,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2659558728337288,
      "epoch": 0.03307086614173228,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010865560034289956,
      "kl": 0.0011735920270439237,
      "learning_rate": 9.933950903195924e-07,
      "loss": 0.0001,
      "num_tokens": 19774773.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 714,
      "step_time": 15.701992142945528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 317.0,
      "completions/max_terminated_length": 317.0,
      "completions/mean_length": 224.625,
      "completions/mean_terminated_length": 224.625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3677009716629982,
      "epoch": 0.033117183881426586,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05988532304763794,
      "kl": 0.0014401644875761122,
      "learning_rate": 9.933858267716535e-07,
      "loss": -0.0327,
      "num_tokens": 19797503.0,
      "reward": 0.6246673464775085,
      "reward_std": 0.461967796087265,
      "rewards/reward_func/mean": 0.6246673464775085,
      "rewards/reward_func/std": 0.4619678258895874,
      "step": 715,
      "step_time": 26.359856016933918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 137.1875,
      "completions/mean_terminated_length": 137.1875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3047793358564377,
      "epoch": 0.03316350162112089,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002679202938452363,
      "kl": 0.0018670128483790904,
      "learning_rate": 9.933765632237146e-07,
      "loss": 0.0001,
      "num_tokens": 19828530.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 716,
      "step_time": 17.258633948862553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 133.75,
      "completions/mean_terminated_length": 133.75,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.262413639575243,
      "epoch": 0.03320981936081519,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006397275719791651,
      "kl": 0.001206342043587938,
      "learning_rate": 9.933672996757758e-07,
      "loss": 0.0001,
      "num_tokens": 19849502.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 717,
      "step_time": 14.940201926976442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 194.3125,
      "completions/mean_terminated_length": 194.3125,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.2365681529045105,
      "epoch": 0.033256137100509495,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0802539587020874,
      "kl": 0.0008338545303558931,
      "learning_rate": 9.933580361278369e-07,
      "loss": -0.0055,
      "num_tokens": 19887619.0,
      "reward": 0.9095170497894287,
      "reward_std": 0.24253785610198975,
      "rewards/reward_func/mean": 0.9095170497894287,
      "rewards/reward_func/std": 0.24253787100315094,
      "step": 718,
      "step_time": 22.915946260094643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 165.3125,
      "completions/mean_terminated_length": 165.3125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.31308742612600327,
      "epoch": 0.0333024548402038,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07160071283578873,
      "kl": 0.0009658441704232246,
      "learning_rate": 9.93348772579898e-07,
      "loss": -0.0388,
      "num_tokens": 19908424.0,
      "reward": 0.0625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.0625,
      "rewards/reward_func/std": 0.25,
      "step": 719,
      "step_time": 17.067168951034546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 172.1875,
      "completions/mean_terminated_length": 172.1875,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.3278944119811058,
      "epoch": 0.0333487725798981,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012633440783247352,
      "kl": 0.0014585844764951617,
      "learning_rate": 9.933395090319591e-07,
      "loss": 0.0001,
      "num_tokens": 19945291.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 720,
      "step_time": 20.91337340325117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 298.0,
      "completions/max_terminated_length": 298.0,
      "completions/mean_length": 206.125,
      "completions/mean_terminated_length": 206.125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3783861994743347,
      "epoch": 0.0333950903195924,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07267819344997406,
      "kl": 0.001166014961199835,
      "learning_rate": 9.933302454840203e-07,
      "loss": -0.1424,
      "num_tokens": 19966749.0,
      "reward": 0.4347226917743683,
      "reward_std": 0.5092058777809143,
      "rewards/reward_func/mean": 0.4347226917743683,
      "rewards/reward_func/std": 0.5092059373855591,
      "step": 721,
      "step_time": 24.837207660079002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 120.75,
      "completions/mean_terminated_length": 120.75,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.270227313041687,
      "epoch": 0.033441408059286706,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00099515775218606,
      "kl": 0.0011917454103240743,
      "learning_rate": 9.933209819360816e-07,
      "loss": 0.0001,
      "num_tokens": 19989625.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 722,
      "step_time": 16.01493902504444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 183.0,
      "completions/mean_terminated_length": 183.0,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3660736382007599,
      "epoch": 0.03348772579898101,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013438266469165683,
      "kl": 0.0015953602851368487,
      "learning_rate": 9.933117183881427e-07,
      "loss": 0.0001,
      "num_tokens": 20012057.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 723,
      "step_time": 18.63400572165847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 162.3125,
      "completions/mean_terminated_length": 162.3125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.34091276675462723,
      "epoch": 0.03353404353867531,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006663731182925403,
      "kl": 0.0012835308152716607,
      "learning_rate": 9.933024548402038e-07,
      "loss": 0.0001,
      "num_tokens": 20039854.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 724,
      "step_time": 19.417426977306604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 160.3125,
      "completions/mean_terminated_length": 160.3125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.40660395473241806,
      "epoch": 0.033580361278369615,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007975733024068177,
      "kl": 0.0016166368732228875,
      "learning_rate": 9.932931912922648e-07,
      "loss": 0.0001,
      "num_tokens": 20097651.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 725,
      "step_time": 26.35659772530198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 163.625,
      "completions/mean_terminated_length": 163.625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3349655866622925,
      "epoch": 0.03362667901806392,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015858388505876064,
      "kl": 0.0015882920124568045,
      "learning_rate": 9.93283927744326e-07,
      "loss": 0.0001,
      "num_tokens": 20146989.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 726,
      "step_time": 23.59270754829049
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 174.25,
      "completions/mean_terminated_length": 174.25,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.37874750047922134,
      "epoch": 0.03367299675775822,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009076311835087836,
      "kl": 0.0014788406842853874,
      "learning_rate": 9.932746641963872e-07,
      "loss": 0.0001,
      "num_tokens": 20182705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 727,
      "step_time": 22.591049123555422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 186.8125,
      "completions/mean_terminated_length": 186.8125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.2066696584224701,
      "epoch": 0.033719314497452524,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08451955765485764,
      "kl": 0.0009963087213691324,
      "learning_rate": 9.932654006484483e-07,
      "loss": 0.0859,
      "num_tokens": 20216014.0,
      "reward": 0.5573630332946777,
      "reward_std": 0.14863014221191406,
      "rewards/reward_func/mean": 0.5573630332946777,
      "rewards/reward_func/std": 0.14863014221191406,
      "step": 728,
      "step_time": 23.8648879006505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 137.1875,
      "completions/mean_terminated_length": 137.1875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3334541544318199,
      "epoch": 0.03376563223714683,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012491237139329314,
      "kl": 0.0016305186436511576,
      "learning_rate": 9.932561371005095e-07,
      "loss": 0.0001,
      "num_tokens": 20251985.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 729,
      "step_time": 18.136456787586212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 130.75,
      "completions/mean_terminated_length": 130.75,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.29052938520908356,
      "epoch": 0.03381194997684113,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001193807809613645,
      "kl": 0.0013317964621819556,
      "learning_rate": 9.932468735525706e-07,
      "loss": 0.0001,
      "num_tokens": 20272509.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 730,
      "step_time": 14.66816596314311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 157.625,
      "completions/mean_terminated_length": 157.625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3935469910502434,
      "epoch": 0.03385826771653543,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006336777005344629,
      "kl": 0.0011270699906162918,
      "learning_rate": 9.932376100046317e-07,
      "loss": 0.0001,
      "num_tokens": 20299671.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 731,
      "step_time": 17.40089276432991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 164.375,
      "completions/mean_terminated_length": 164.375,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.3299044221639633,
      "epoch": 0.033904585456229736,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006873649545013905,
      "kl": 0.001305067795328796,
      "learning_rate": 9.932283464566928e-07,
      "loss": 0.0001,
      "num_tokens": 20322429.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 732,
      "step_time": 18.563721273094416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 257.0,
      "completions/max_terminated_length": 257.0,
      "completions/mean_length": 186.6875,
      "completions/mean_terminated_length": 186.6875,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.371376670897007,
      "epoch": 0.03395090319592404,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08148089051246643,
      "kl": 0.0010322736197849736,
      "learning_rate": 9.93219082908754e-07,
      "loss": -0.038,
      "num_tokens": 20342808.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 733,
      "step_time": 21.671961937099695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 131.5,
      "completions/mean_terminated_length": 131.5,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.22905239090323448,
      "epoch": 0.03399722093561834,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014935294166207314,
      "kl": 0.001308032893575728,
      "learning_rate": 9.93209819360815e-07,
      "loss": 0.0001,
      "num_tokens": 20362368.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 734,
      "step_time": 13.496449582278728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 198.875,
      "completions/mean_terminated_length": 198.875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.2636433206498623,
      "epoch": 0.034043538675312644,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005140812718309462,
      "kl": 0.0007544051186414436,
      "learning_rate": 9.932005558128762e-07,
      "loss": 0.0,
      "num_tokens": 20399006.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 735,
      "step_time": 22.731944765895605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 208.625,
      "completions/mean_terminated_length": 208.625,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.16582611203193665,
      "epoch": 0.03408985641500695,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007200704421848059,
      "kl": 0.0008659442391945049,
      "learning_rate": 9.931912922649375e-07,
      "loss": 0.0,
      "num_tokens": 20423320.0,
      "reward": 0.46831193566322327,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.46831193566322327,
      "rewards/reward_func/std": 0.0,
      "step": 736,
      "step_time": 20.08252888917923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 164.25,
      "completions/mean_terminated_length": 164.25,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3942590281367302,
      "epoch": 0.03413617415470125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002515591913834214,
      "kl": 0.002140891447197646,
      "learning_rate": 9.931820287169985e-07,
      "loss": 0.0001,
      "num_tokens": 20453596.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 737,
      "step_time": 20.763261321932077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 181.875,
      "completions/mean_terminated_length": 181.875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3159395232796669,
      "epoch": 0.03418249189439555,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1018763855099678,
      "kl": 0.002017256512772292,
      "learning_rate": 9.931727651690596e-07,
      "loss": -0.0771,
      "num_tokens": 20478202.0,
      "reward": 0.3125,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.3125,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 738,
      "step_time": 19.832482635974884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 118.375,
      "completions/mean_terminated_length": 118.375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.23701602593064308,
      "epoch": 0.034228809634089856,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008975856471806765,
      "kl": 0.0010572972532827407,
      "learning_rate": 9.93163501621121e-07,
      "loss": 0.0001,
      "num_tokens": 20498528.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 739,
      "step_time": 13.707400850951672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 196.8125,
      "completions/mean_terminated_length": 196.8125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.3240229934453964,
      "epoch": 0.03427512737378416,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07890082150697708,
      "kl": 0.0016898235189728439,
      "learning_rate": 9.93154238073182e-07,
      "loss": -0.0403,
      "num_tokens": 20521229.0,
      "reward": 0.3125,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.3125,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 740,
      "step_time": 22.117990609258413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 152.5625,
      "completions/mean_terminated_length": 152.5625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.36976949870586395,
      "epoch": 0.03432144511347846,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006292248726822436,
      "kl": 0.001158057653810829,
      "learning_rate": 9.931449745252432e-07,
      "loss": 0.0001,
      "num_tokens": 20552102.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 741,
      "step_time": 18.70715820044279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 321.0,
      "completions/max_terminated_length": 321.0,
      "completions/mean_length": 227.375,
      "completions/mean_terminated_length": 227.375,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.3275139331817627,
      "epoch": 0.034367762853172765,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06887755542993546,
      "kl": 0.001257510157302022,
      "learning_rate": 9.931357109773043e-07,
      "loss": -0.2157,
      "num_tokens": 20579036.0,
      "reward": 0.2892181873321533,
      "reward_std": 0.4444383680820465,
      "rewards/reward_func/mean": 0.2892181873321533,
      "rewards/reward_func/std": 0.4444383978843689,
      "step": 742,
      "step_time": 27.33681583032012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 184.1875,
      "completions/mean_terminated_length": 184.1875,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.4183108061552048,
      "epoch": 0.03441408059286707,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011316396994516253,
      "kl": 0.001560896256705746,
      "learning_rate": 9.931264474293654e-07,
      "loss": 0.0001,
      "num_tokens": 20608591.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 743,
      "step_time": 22.03494720160961
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 272.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 189.1875,
      "completions/mean_terminated_length": 189.1875,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.3440265357494354,
      "epoch": 0.03446039833256137,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11852071434259415,
      "kl": 0.0022669222380500287,
      "learning_rate": 9.931171838814265e-07,
      "loss": -0.0199,
      "num_tokens": 20633218.0,
      "reward": 0.8482850790023804,
      "reward_std": 0.2262093424797058,
      "rewards/reward_func/mean": 0.8482850790023804,
      "rewards/reward_func/std": 0.226209357380867,
      "step": 744,
      "step_time": 24.622589204460382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 306.0,
      "completions/max_terminated_length": 306.0,
      "completions/mean_length": 223.8125,
      "completions/mean_terminated_length": 223.8125,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.3200981020927429,
      "epoch": 0.03450671607225567,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05692924186587334,
      "kl": 0.0008040217508096248,
      "learning_rate": 9.931079203334877e-07,
      "loss": -0.0969,
      "num_tokens": 20659615.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 745,
      "step_time": 26.291905768215656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 164.3125,
      "completions/mean_terminated_length": 164.3125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.1850629523396492,
      "epoch": 0.034553033811949976,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07412910461425781,
      "kl": 0.0010273780062561855,
      "learning_rate": 9.930986567855488e-07,
      "loss": -0.0354,
      "num_tokens": 20681252.0,
      "reward": 0.8714442253112793,
      "reward_std": 0.1666259467601776,
      "rewards/reward_func/mean": 0.8714442253112793,
      "rewards/reward_func/std": 0.1666259467601776,
      "step": 746,
      "step_time": 16.422810439020395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 152.5,
      "completions/mean_terminated_length": 152.5,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.424900121986866,
      "epoch": 0.03459935155164428,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016889924881979823,
      "kl": 0.0015171955456025898,
      "learning_rate": 9.9308939323761e-07,
      "loss": 0.0001,
      "num_tokens": 20712300.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 747,
      "step_time": 18.411602519452572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 160.0625,
      "completions/mean_terminated_length": 160.0625,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.19160006195306778,
      "epoch": 0.03464566929133858,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006853294325992465,
      "kl": 0.0007444946531904861,
      "learning_rate": 9.93080129689671e-07,
      "loss": 0.0,
      "num_tokens": 20744205.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 748,
      "step_time": 18.91827342286706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 111.5,
      "completions/mean_terminated_length": 111.5,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.33665014058351517,
      "epoch": 0.034691987031032885,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003475069534033537,
      "kl": 0.0019420332391746342,
      "learning_rate": 9.930708661417322e-07,
      "loss": 0.0001,
      "num_tokens": 20764709.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 749,
      "step_time": 14.929662246257067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 131.0,
      "completions/mean_length": 114.75,
      "completions/mean_terminated_length": 114.75,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.30779921263456345,
      "epoch": 0.03473830477072719,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016567701241001487,
      "kl": 0.0011813818127848208,
      "learning_rate": 9.930616025937933e-07,
      "loss": 0.0001,
      "num_tokens": 20784625.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 750,
      "step_time": 13.047545105218887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 149.875,
      "completions/mean_terminated_length": 149.875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.30832144618034363,
      "epoch": 0.03478462251042149,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010846181539818645,
      "kl": 0.0011726339143933728,
      "learning_rate": 9.930523390458544e-07,
      "loss": 0.0001,
      "num_tokens": 20807743.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 751,
      "step_time": 18.087368704378605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 179.625,
      "completions/mean_terminated_length": 179.625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.2382570020854473,
      "epoch": 0.034830940250115794,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17337356507778168,
      "kl": 0.0009417200344614685,
      "learning_rate": 9.930430754979158e-07,
      "loss": -0.0338,
      "num_tokens": 20842793.0,
      "reward": 0.1939225196838379,
      "reward_std": 0.11004070192575455,
      "rewards/reward_func/mean": 0.1939225196838379,
      "rewards/reward_func/std": 0.11004070192575455,
      "step": 752,
      "step_time": 23.629357885569334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 176.875,
      "completions/mean_terminated_length": 176.875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.2175726406276226,
      "epoch": 0.0348772579898101,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16890059411525726,
      "kl": 0.0011022739636246115,
      "learning_rate": 9.930338119499769e-07,
      "loss": -0.0042,
      "num_tokens": 20869639.0,
      "reward": 0.21134749054908752,
      "reward_std": 0.023851996287703514,
      "rewards/reward_func/mean": 0.21134749054908752,
      "rewards/reward_func/std": 0.023851994425058365,
      "step": 753,
      "step_time": 18.526702269911766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 159.9375,
      "completions/mean_terminated_length": 159.9375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.43599508702754974,
      "epoch": 0.0349235757295044,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010902105132117867,
      "kl": 0.0014562978176400065,
      "learning_rate": 9.93024548402038e-07,
      "loss": 0.0001,
      "num_tokens": 20915062.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 754,
      "step_time": 23.182676058262587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 213.625,
      "completions/mean_terminated_length": 213.625,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.31797099113464355,
      "epoch": 0.0349698934691987,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0610932894051075,
      "kl": 0.0012669971329160035,
      "learning_rate": 9.93015284854099e-07,
      "loss": 0.0008,
      "num_tokens": 20946928.0,
      "reward": 0.7376224994659424,
      "reward_std": 0.3595956563949585,
      "rewards/reward_func/mean": 0.7376224994659424,
      "rewards/reward_func/std": 0.3595956563949585,
      "step": 755,
      "step_time": 23.928455755114555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 174.875,
      "completions/mean_terminated_length": 174.875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4484062194824219,
      "epoch": 0.035016211208893006,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010049432748928666,
      "kl": 0.0016918782494030893,
      "learning_rate": 9.930060213061603e-07,
      "loss": 0.0001,
      "num_tokens": 20970862.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 756,
      "step_time": 18.918332800269127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 133.1875,
      "completions/mean_terminated_length": 133.1875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.13485882431268692,
      "epoch": 0.03506252894858731,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10510154068470001,
      "kl": 0.0009273640462197363,
      "learning_rate": 9.929967577582214e-07,
      "loss": -0.0104,
      "num_tokens": 20991297.0,
      "reward": 0.11500650644302368,
      "reward_std": 0.31425464153289795,
      "rewards/reward_func/mean": 0.11500650644302368,
      "rewards/reward_func/std": 0.31425464153289795,
      "step": 757,
      "step_time": 14.219180513173342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 142.125,
      "completions/mean_terminated_length": 142.125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.21773843094706535,
      "epoch": 0.03510884668828161,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007945893448777497,
      "kl": 0.0009267694258596748,
      "learning_rate": 9.929874942102825e-07,
      "loss": 0.0,
      "num_tokens": 21013027.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 758,
      "step_time": 15.522979341447353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 134.0,
      "completions/mean_terminated_length": 134.0,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.338830903172493,
      "epoch": 0.035155164427975914,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001139418687671423,
      "kl": 0.0012303398980293423,
      "learning_rate": 9.929782306623436e-07,
      "loss": 0.0001,
      "num_tokens": 21033395.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 759,
      "step_time": 14.582605965435505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 142.6875,
      "completions/mean_terminated_length": 142.6875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.32310692965984344,
      "epoch": 0.03520148216767022,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023779685143381357,
      "kl": 0.0016642717528156936,
      "learning_rate": 9.929689671144048e-07,
      "loss": 0.0001,
      "num_tokens": 21053390.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 760,
      "step_time": 17.098231252282858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 167.1875,
      "completions/mean_terminated_length": 167.1875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.41315774619579315,
      "epoch": 0.03524779990736452,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005841231904923916,
      "kl": 0.0013927049294579774,
      "learning_rate": 9.929597035664659e-07,
      "loss": 0.0001,
      "num_tokens": 21087617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 761,
      "step_time": 22.515921484678984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 175.4375,
      "completions/mean_terminated_length": 175.4375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.21197232976555824,
      "epoch": 0.03529411764705882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06189983710646629,
      "kl": 0.0008826443227007985,
      "learning_rate": 9.92950440018527e-07,
      "loss": -0.0124,
      "num_tokens": 21109352.0,
      "reward": 0.59250807762146,
      "reward_std": 0.15573769807815552,
      "rewards/reward_func/mean": 0.59250807762146,
      "rewards/reward_func/std": 0.15573768317699432,
      "step": 762,
      "step_time": 18.795867145061493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 159.1875,
      "completions/mean_terminated_length": 159.1875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.33611395210027695,
      "epoch": 0.035340435386753126,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012660189531743526,
      "kl": 0.0015209891716949642,
      "learning_rate": 9.929411764705881e-07,
      "loss": 0.0001,
      "num_tokens": 21146523.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 763,
      "step_time": 20.935311947017908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 180.5,
      "completions/mean_terminated_length": 180.5,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.21683895960450172,
      "epoch": 0.03538675312644743,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012084031477570534,
      "kl": 0.001167740294476971,
      "learning_rate": 9.929319129226493e-07,
      "loss": 0.0001,
      "num_tokens": 21168275.0,
      "reward": 0.9487294554710388,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9487294554710388,
      "rewards/reward_func/std": 0.0,
      "step": 764,
      "step_time": 18.577101062983274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 128.875,
      "completions/mean_terminated_length": 128.875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.29378584772348404,
      "epoch": 0.03543307086614173,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018863253062590957,
      "kl": 0.0017148111073765904,
      "learning_rate": 9.929226493747104e-07,
      "loss": 0.0001,
      "num_tokens": 21190977.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 765,
      "step_time": 14.933240331709385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 141.9375,
      "completions/mean_terminated_length": 141.9375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.3214898705482483,
      "epoch": 0.035479388605836035,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006536695291288197,
      "kl": 0.0010365339112468064,
      "learning_rate": 9.929133858267717e-07,
      "loss": 0.0001,
      "num_tokens": 21212544.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 766,
      "step_time": 15.943621318787336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 244.8125,
      "completions/mean_terminated_length": 244.8125,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "entropy": 0.28746990859508514,
      "epoch": 0.03552570634553034,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.061868999153375626,
      "kl": 0.0009608932887203991,
      "learning_rate": 9.929041222788328e-07,
      "loss": -0.022,
      "num_tokens": 21249981.0,
      "reward": 0.7940504550933838,
      "reward_std": 0.07485879957675934,
      "rewards/reward_func/mean": 0.7940504550933838,
      "rewards/reward_func/std": 0.07485879212617874,
      "step": 767,
      "step_time": 26.725659370422363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 125.6875,
      "completions/mean_terminated_length": 125.6875,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.27554355934262276,
      "epoch": 0.03557202408522464,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008337905164808035,
      "kl": 0.0012617142347153276,
      "learning_rate": 9.928948587308938e-07,
      "loss": 0.0001,
      "num_tokens": 21270920.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 768,
      "step_time": 13.756904244422913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 156.875,
      "completions/mean_terminated_length": 156.875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.39798150956630707,
      "epoch": 0.03561834182491894,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011346646351739764,
      "kl": 0.0016589273873250932,
      "learning_rate": 9.92885595182955e-07,
      "loss": 0.0001,
      "num_tokens": 21305014.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 769,
      "step_time": 20.33734503760934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 140.5,
      "completions/mean_terminated_length": 140.5,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3338210806250572,
      "epoch": 0.035664659564613246,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009008368942886591,
      "kl": 0.0012040752044413239,
      "learning_rate": 9.928763316350162e-07,
      "loss": 0.0001,
      "num_tokens": 21331598.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 770,
      "step_time": 16.965304989367723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 123.0,
      "completions/max_terminated_length": 123.0,
      "completions/mean_length": 109.1875,
      "completions/mean_terminated_length": 109.1875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.28656987100839615,
      "epoch": 0.03571097730430755,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009368361788801849,
      "kl": 0.00133035026374273,
      "learning_rate": 9.928670680870773e-07,
      "loss": 0.0001,
      "num_tokens": 21352081.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 771,
      "step_time": 12.301237791776657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 127.3125,
      "completions/mean_terminated_length": 127.3125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2899698242545128,
      "epoch": 0.03575729504400185,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001047339290380478,
      "kl": 0.0013316216936800629,
      "learning_rate": 9.928578045391385e-07,
      "loss": 0.0001,
      "num_tokens": 21374518.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 772,
      "step_time": 14.309594821184874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 201.125,
      "completions/mean_terminated_length": 201.125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3939480558037758,
      "epoch": 0.035803612783696155,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00047841350897215307,
      "kl": 0.0012557295995065942,
      "learning_rate": 9.928485409911996e-07,
      "loss": 0.0001,
      "num_tokens": 21402280.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 773,
      "step_time": 21.371572624891996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 195.25,
      "completions/mean_terminated_length": 195.25,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3412705659866333,
      "epoch": 0.03584993052339046,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07474339753389359,
      "kl": 0.0010987457353621721,
      "learning_rate": 9.928392774432607e-07,
      "loss": -0.0085,
      "num_tokens": 21439276.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 774,
      "step_time": 24.488349922001362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 140.625,
      "completions/mean_terminated_length": 140.625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3591512441635132,
      "epoch": 0.03589624826308476,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027559211011976004,
      "kl": 0.0017321380146313459,
      "learning_rate": 9.928300138953218e-07,
      "loss": 0.0001,
      "num_tokens": 21472214.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 775,
      "step_time": 18.36331943050027
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 148.25,
      "completions/mean_terminated_length": 148.25,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.30273303389549255,
      "epoch": 0.035942566002779064,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005927165038883686,
      "kl": 0.0011400552175473422,
      "learning_rate": 9.92820750347383e-07,
      "loss": 0.0001,
      "num_tokens": 21494346.0,
      "reward": 0.47266900539398193,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.47266900539398193,
      "rewards/reward_func/std": 0.0,
      "step": 776,
      "step_time": 16.990239322185516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 167.9375,
      "completions/mean_terminated_length": 167.9375,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.1891249492764473,
      "epoch": 0.03598888374247337,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09204047173261642,
      "kl": 0.000799347908468917,
      "learning_rate": 9.92811486799444e-07,
      "loss": -0.0961,
      "num_tokens": 21527049.0,
      "reward": 0.6444321870803833,
      "reward_std": 0.3238654136657715,
      "rewards/reward_func/mean": 0.6444321870803833,
      "rewards/reward_func/std": 0.3238654136657715,
      "step": 777,
      "step_time": 21.485658913850784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 118.375,
      "completions/mean_terminated_length": 118.375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.22776512056589127,
      "epoch": 0.03603520148216767,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016181276878342032,
      "kl": 0.0009063722391147166,
      "learning_rate": 9.928022232515052e-07,
      "loss": 0.0,
      "num_tokens": 21548783.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 778,
      "step_time": 13.551093552261591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 170.625,
      "completions/mean_terminated_length": 170.625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.1654873713850975,
      "epoch": 0.03608151922186197,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07232765853404999,
      "kl": 0.0006980880134506151,
      "learning_rate": 9.927929597035666e-07,
      "loss": -0.1047,
      "num_tokens": 21570921.0,
      "reward": 0.33508291840553284,
      "reward_std": 0.3896864056587219,
      "rewards/reward_func/mean": 0.33508291840553284,
      "rewards/reward_func/std": 0.3896864354610443,
      "step": 779,
      "step_time": 18.786138746887445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 152.875,
      "completions/mean_terminated_length": 152.875,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.24532657489180565,
      "epoch": 0.036127836961556276,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022845608182251453,
      "kl": 0.0017486957367509604,
      "learning_rate": 9.927836961556275e-07,
      "loss": 0.0001,
      "num_tokens": 21600119.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 780,
      "step_time": 20.80349262431264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 128.875,
      "completions/mean_terminated_length": 128.875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.32380537688732147,
      "epoch": 0.03617415470125058,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006954511627554893,
      "kl": 0.0011527634633239359,
      "learning_rate": 9.927744326076886e-07,
      "loss": 0.0001,
      "num_tokens": 21620965.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 781,
      "step_time": 14.75655872002244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 135.375,
      "completions/mean_terminated_length": 135.375,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.2626483403146267,
      "epoch": 0.03622047244094488,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008991743670776486,
      "kl": 0.001040646922774613,
      "learning_rate": 9.9276516905975e-07,
      "loss": 0.0001,
      "num_tokens": 21643931.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 782,
      "step_time": 15.008624862879515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 148.375,
      "completions/mean_terminated_length": 148.375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.32240891456604004,
      "epoch": 0.036266790180639184,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012835003435611725,
      "kl": 0.001953296741703525,
      "learning_rate": 9.92755905511811e-07,
      "loss": 0.0001,
      "num_tokens": 21696177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 783,
      "step_time": 24.101244494318962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 179.6875,
      "completions/mean_terminated_length": 179.6875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3580075055360794,
      "epoch": 0.03631310792033349,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006679447251372039,
      "kl": 0.0010465997911524028,
      "learning_rate": 9.927466419638722e-07,
      "loss": 0.0001,
      "num_tokens": 21719980.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 784,
      "step_time": 19.346400436013937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 178.8125,
      "completions/mean_terminated_length": 178.8125,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.43494129180908203,
      "epoch": 0.03635942566002779,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001725780894048512,
      "kl": 0.0016527536790817976,
      "learning_rate": 9.927373784159333e-07,
      "loss": 0.0001,
      "num_tokens": 21741257.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 785,
      "step_time": 18.63063418865204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 186.125,
      "completions/mean_terminated_length": 186.125,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.4212736263871193,
      "epoch": 0.03640574339972209,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013622586848214269,
      "kl": 0.0018332574982196093,
      "learning_rate": 9.927281148679944e-07,
      "loss": 0.0001,
      "num_tokens": 21770571.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 786,
      "step_time": 20.721955724060535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 184.25,
      "completions/mean_terminated_length": 184.25,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.21823961660265923,
      "epoch": 0.036452061139416396,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006008459022268653,
      "kl": 0.0008442414400633425,
      "learning_rate": 9.927188513200556e-07,
      "loss": 0.0,
      "num_tokens": 21821855.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 787,
      "step_time": 26.682371847331524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 214.6875,
      "completions/mean_terminated_length": 214.6875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.40522969514131546,
      "epoch": 0.0364983788791107,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07477609068155289,
      "kl": 0.0014336762833409011,
      "learning_rate": 9.927095877721167e-07,
      "loss": -0.0727,
      "num_tokens": 21847130.0,
      "reward": 0.018614530563354492,
      "reward_std": 0.027606507763266563,
      "rewards/reward_func/mean": 0.018614530563354492,
      "rewards/reward_func/std": 0.027606507763266563,
      "step": 788,
      "step_time": 23.462858349084854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 168.5,
      "completions/mean_terminated_length": 168.5,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3644990473985672,
      "epoch": 0.036544696618805,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012292582541704178,
      "kl": 0.001423132256604731,
      "learning_rate": 9.927003242241778e-07,
      "loss": 0.0001,
      "num_tokens": 21871554.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 789,
      "step_time": 18.672618698328733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 135.3125,
      "completions/mean_terminated_length": 135.3125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.2719787135720253,
      "epoch": 0.036591014358499305,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009575061849318445,
      "kl": 0.0013380526943365112,
      "learning_rate": 9.92691060676239e-07,
      "loss": 0.0001,
      "num_tokens": 21894455.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 790,
      "step_time": 14.909899402409792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 129.0,
      "completions/max_terminated_length": 129.0,
      "completions/mean_length": 110.875,
      "completions/mean_terminated_length": 110.875,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.2577039450407028,
      "epoch": 0.03663733209819361,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015504512703046203,
      "kl": 0.0015930982190184295,
      "learning_rate": 9.926817971283e-07,
      "loss": 0.0001,
      "num_tokens": 21913701.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 791,
      "step_time": 12.476087305694818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 117.0,
      "completions/mean_terminated_length": 117.0,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "entropy": 0.26193511486053467,
      "epoch": 0.03668364983788791,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015716665657237172,
      "kl": 0.0015727959398645908,
      "learning_rate": 9.926725335803612e-07,
      "loss": 0.0001,
      "num_tokens": 21933701.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 792,
      "step_time": 12.937334209680557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 179.125,
      "completions/mean_terminated_length": 179.125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.36753329634666443,
      "epoch": 0.036729967577582213,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006561006885021925,
      "kl": 0.0013610675669042394,
      "learning_rate": 9.926632700324223e-07,
      "loss": 0.0001,
      "num_tokens": 21955303.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 793,
      "step_time": 18.506981823593378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 132.0,
      "completions/mean_terminated_length": 132.0,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.23263446241617203,
      "epoch": 0.036776285317276516,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011079427786171436,
      "kl": 0.0013293431256897748,
      "learning_rate": 9.926540064844834e-07,
      "loss": 0.0001,
      "num_tokens": 21974999.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 794,
      "step_time": 14.711909919977188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 159.8125,
      "completions/mean_terminated_length": 159.8125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.12268547713756561,
      "epoch": 0.03682260305697082,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06275269389152527,
      "kl": 0.000531562152900733,
      "learning_rate": 9.926447429365446e-07,
      "loss": -0.0501,
      "num_tokens": 22006564.0,
      "reward": 0.9305884838104248,
      "reward_std": 0.018509721383452415,
      "rewards/reward_func/mean": 0.9305884838104248,
      "rewards/reward_func/std": 0.01850973069667816,
      "step": 795,
      "step_time": 19.20914574339986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 125.4375,
      "completions/mean_terminated_length": 125.4375,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.27159298956394196,
      "epoch": 0.03686892079666512,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009578621829859912,
      "kl": 0.00117204335401766,
      "learning_rate": 9.926354793886059e-07,
      "loss": 0.0001,
      "num_tokens": 22026539.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 796,
      "step_time": 14.955506909638643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 199.625,
      "completions/mean_terminated_length": 199.625,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.36915767937898636,
      "epoch": 0.036915238536359425,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000535281898919493,
      "kl": 0.0009660491195973009,
      "learning_rate": 9.92626215840667e-07,
      "loss": 0.0,
      "num_tokens": 22051605.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 797,
      "step_time": 20.611786134541035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 142.625,
      "completions/mean_terminated_length": 142.625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.21989630907773972,
      "epoch": 0.03696155627605373,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008252896368503571,
      "kl": 0.0009417905821464956,
      "learning_rate": 9.92616952292728e-07,
      "loss": 0.0,
      "num_tokens": 22071487.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 798,
      "step_time": 14.327708523720503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 152.0625,
      "completions/mean_terminated_length": 152.0625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.3419434130191803,
      "epoch": 0.03700787401574803,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008594851242378354,
      "kl": 0.001281717763049528,
      "learning_rate": 9.926076887447893e-07,
      "loss": 0.0001,
      "num_tokens": 22096880.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 799,
      "step_time": 17.970007836818695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 196.0,
      "completions/mean_terminated_length": 196.0,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.4320857673883438,
      "epoch": 0.037054191755442334,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022666838485747576,
      "kl": 0.0022287203755695373,
      "learning_rate": 9.925984251968504e-07,
      "loss": 0.0001,
      "num_tokens": 22126368.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 800,
      "step_time": 24.515904534608126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 142.875,
      "completions/mean_terminated_length": 142.875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.4241221025586128,
      "epoch": 0.03710050949513664,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011163548333570361,
      "kl": 0.0019402401230763644,
      "learning_rate": 9.925891616489115e-07,
      "loss": 0.0001,
      "num_tokens": 22156014.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 801,
      "step_time": 18.589754354208708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 163.6875,
      "completions/mean_terminated_length": 163.6875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.17512871697545052,
      "epoch": 0.03714682723483094,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010291461367160082,
      "kl": 0.0008728425891604275,
      "learning_rate": 9.925798981009726e-07,
      "loss": 0.0,
      "num_tokens": 22176841.0,
      "reward": 0.7403417825698853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7403417825698853,
      "rewards/reward_func/std": 0.0,
      "step": 802,
      "step_time": 16.448123518377542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 377.0,
      "completions/max_terminated_length": 377.0,
      "completions/mean_length": 285.6875,
      "completions/mean_terminated_length": 285.6875,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "entropy": 0.26425112783908844,
      "epoch": 0.03719314497452524,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04466618597507477,
      "kl": 0.0007954746979521587,
      "learning_rate": 9.925706345530338e-07,
      "loss": -0.0642,
      "num_tokens": 22205924.0,
      "reward": 0.5974346399307251,
      "reward_std": 0.2332146167755127,
      "rewards/reward_func/mean": 0.5974346399307251,
      "rewards/reward_func/std": 0.2332146167755127,
      "step": 803,
      "step_time": 30.98456759750843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 177.0,
      "completions/mean_terminated_length": 177.0,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.15964826568961143,
      "epoch": 0.037239462714219546,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001106057083234191,
      "kl": 0.0009501670720055699,
      "learning_rate": 9.925613710050949e-07,
      "loss": 0.0,
      "num_tokens": 22240180.0,
      "reward": 0.8702397346496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8702397346496582,
      "rewards/reward_func/std": 0.0,
      "step": 804,
      "step_time": 21.30280603468418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 208.5625,
      "completions/mean_terminated_length": 208.5625,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.39372440427541733,
      "epoch": 0.03728578045391385,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1070389375090599,
      "kl": 0.001414378290064633,
      "learning_rate": 9.92552107457156e-07,
      "loss": -0.0253,
      "num_tokens": 22265533.0,
      "reward": 0.5325166583061218,
      "reward_std": 0.4859153926372528,
      "rewards/reward_func/mean": 0.5325166583061218,
      "rewards/reward_func/std": 0.4859154224395752,
      "step": 805,
      "step_time": 22.00924064591527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 151.9375,
      "completions/mean_terminated_length": 151.9375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.27635348588228226,
      "epoch": 0.03733209819360815,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012643581721931696,
      "kl": 0.0012706111301667988,
      "learning_rate": 9.925428439092171e-07,
      "loss": 0.0001,
      "num_tokens": 22288812.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 806,
      "step_time": 17.406505286693573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 133.125,
      "completions/mean_terminated_length": 133.125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.24264325946569443,
      "epoch": 0.037378415933302454,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006564650102518499,
      "kl": 0.0009725986019475386,
      "learning_rate": 9.925335803612783e-07,
      "loss": 0.0,
      "num_tokens": 22308526.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 807,
      "step_time": 14.06134543940425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 169.8125,
      "completions/mean_terminated_length": 169.8125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.4375000149011612,
      "epoch": 0.03742473367299676,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009823600994423032,
      "kl": 0.0012600447225850075,
      "learning_rate": 9.925243168133394e-07,
      "loss": 0.0001,
      "num_tokens": 22341451.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 808,
      "step_time": 22.87074578180909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 165.6875,
      "completions/mean_terminated_length": 165.6875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.38376880437135696,
      "epoch": 0.03747105141269106,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005137338303029537,
      "kl": 0.0012045193143421784,
      "learning_rate": 9.925150532654007e-07,
      "loss": 0.0001,
      "num_tokens": 22367814.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 809,
      "step_time": 18.01202069595456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 132.0625,
      "completions/mean_terminated_length": 132.0625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2751566097140312,
      "epoch": 0.03751736915238536,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010438364697620273,
      "kl": 0.0010625261347740889,
      "learning_rate": 9.925057897174618e-07,
      "loss": 0.0001,
      "num_tokens": 22390007.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 810,
      "step_time": 14.121484663337469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 211.0,
      "completions/mean_terminated_length": 211.0,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.2260236032307148,
      "epoch": 0.037563686892079666,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000488622288685292,
      "kl": 0.0007573103066533804,
      "learning_rate": 9.924965261695228e-07,
      "loss": 0.0,
      "num_tokens": 22427959.0,
      "reward": 0.9383861422538757,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9383861422538757,
      "rewards/reward_func/std": 0.0,
      "step": 811,
      "step_time": 24.568120811134577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 318.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 190.75,
      "completions/mean_terminated_length": 190.75,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.2874022424221039,
      "epoch": 0.03761000463177397,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10655516386032104,
      "kl": 0.0017227102653123438,
      "learning_rate": 9.92487262621584e-07,
      "loss": -0.1044,
      "num_tokens": 22466147.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 812,
      "step_time": 29.970730647444725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 130.3125,
      "completions/mean_terminated_length": 130.3125,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.24665077775716782,
      "epoch": 0.03765632237146827,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003576041432097554,
      "kl": 0.0016069425037130713,
      "learning_rate": 9.924779990736452e-07,
      "loss": 0.0001,
      "num_tokens": 22487624.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 813,
      "step_time": 14.225050505250692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 117.875,
      "completions/mean_terminated_length": 117.875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.2610808089375496,
      "epoch": 0.037702640111162575,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008053500787355006,
      "kl": 0.0011654336849460378,
      "learning_rate": 9.924687355257063e-07,
      "loss": 0.0001,
      "num_tokens": 22508454.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 814,
      "step_time": 13.55789552256465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 124.6875,
      "completions/mean_terminated_length": 124.6875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2975633218884468,
      "epoch": 0.03774895785085688,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008555660024285316,
      "kl": 0.0011338264012010768,
      "learning_rate": 9.924594719777675e-07,
      "loss": 0.0001,
      "num_tokens": 22528577.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 815,
      "step_time": 13.34862768650055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 160.1875,
      "completions/mean_terminated_length": 160.1875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.42140527814626694,
      "epoch": 0.03779527559055118,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007337057613767684,
      "kl": 0.0013057987089268863,
      "learning_rate": 9.924502084298286e-07,
      "loss": 0.0001,
      "num_tokens": 22557012.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 816,
      "step_time": 20.276419568806887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 164.1875,
      "completions/mean_terminated_length": 164.1875,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.2034543715417385,
      "epoch": 0.037841593330245483,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007220017723739147,
      "kl": 0.000908565940335393,
      "learning_rate": 9.924409448818897e-07,
      "loss": 0.0,
      "num_tokens": 22589927.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 817,
      "step_time": 19.533989932388067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 191.625,
      "completions/mean_terminated_length": 191.625,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.32018114626407623,
      "epoch": 0.037887911069939786,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006020690780133009,
      "kl": 0.0010532069572946057,
      "learning_rate": 9.924316813339508e-07,
      "loss": 0.0001,
      "num_tokens": 22617281.0,
      "reward": 0.020545542240142822,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.020545542240142822,
      "rewards/reward_func/std": 0.0,
      "step": 818,
      "step_time": 20.884681150317192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 134.75,
      "completions/mean_terminated_length": 134.75,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3311654031276703,
      "epoch": 0.03793422880963409,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013042479986324906,
      "kl": 0.0016282762517221272,
      "learning_rate": 9.92422417786012e-07,
      "loss": 0.0001,
      "num_tokens": 22638445.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 819,
      "step_time": 15.344467476010323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 138.1875,
      "completions/mean_terminated_length": 138.1875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.28598906099796295,
      "epoch": 0.03798054654932839,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000775039370637387,
      "kl": 0.0010381936153862625,
      "learning_rate": 9.92413154238073e-07,
      "loss": 0.0001,
      "num_tokens": 22660192.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 820,
      "step_time": 15.39529787749052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 122.1875,
      "completions/mean_terminated_length": 122.1875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3442473039031029,
      "epoch": 0.038026864289022695,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001066489377990365,
      "kl": 0.0014191086229402572,
      "learning_rate": 9.924038906901342e-07,
      "loss": 0.0001,
      "num_tokens": 22682227.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 821,
      "step_time": 14.344931341707706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 167.0,
      "completions/mean_terminated_length": 167.0,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3186531886458397,
      "epoch": 0.038073182028717,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001553441397845745,
      "kl": 0.001338458008831367,
      "learning_rate": 9.923946271421956e-07,
      "loss": 0.0001,
      "num_tokens": 22705123.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 822,
      "step_time": 19.846717324107885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 196.875,
      "completions/mean_terminated_length": 196.875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.371276393532753,
      "epoch": 0.0381194997684113,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06931908428668976,
      "kl": 0.0010701469436753541,
      "learning_rate": 9.923853635942565e-07,
      "loss": -0.0327,
      "num_tokens": 22726433.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 823,
      "step_time": 22.185349114239216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 151.25,
      "completions/mean_terminated_length": 151.25,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.41176582127809525,
      "epoch": 0.038165817508105604,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006509088561870158,
      "kl": 0.0011380686919437721,
      "learning_rate": 9.923761000463176e-07,
      "loss": 0.0001,
      "num_tokens": 22751893.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 824,
      "step_time": 16.74121941626072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 195.3125,
      "completions/mean_terminated_length": 195.3125,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.4078410267829895,
      "epoch": 0.03821213524779991,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006997660966590047,
      "kl": 0.0013196228246670216,
      "learning_rate": 9.923668364983787e-07,
      "loss": 0.0001,
      "num_tokens": 22773690.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 825,
      "step_time": 23.617992267012596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 281.0,
      "completions/max_terminated_length": 281.0,
      "completions/mean_length": 211.3125,
      "completions/mean_terminated_length": 211.3125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.4072626978158951,
      "epoch": 0.03825845298749421,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08284235745668411,
      "kl": 0.0013661879929713905,
      "learning_rate": 9.9235757295044e-07,
      "loss": 0.0624,
      "num_tokens": 22799215.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 826,
      "step_time": 23.910189773887396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 128.125,
      "completions/mean_terminated_length": 128.125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.24906330555677414,
      "epoch": 0.03830477072718851,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009547322406433523,
      "kl": 0.0009482608147663996,
      "learning_rate": 9.923483094025012e-07,
      "loss": 0.0,
      "num_tokens": 22818913.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 827,
      "step_time": 14.135732557624578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 110.5,
      "completions/mean_terminated_length": 110.5,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.23942997679114342,
      "epoch": 0.038351088466882816,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010104505345225334,
      "kl": 0.0011649180960375816,
      "learning_rate": 9.923390458545623e-07,
      "loss": 0.0001,
      "num_tokens": 22838713.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 828,
      "step_time": 13.335768409073353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 148.8125,
      "completions/mean_terminated_length": 148.8125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3456391915678978,
      "epoch": 0.03839740620657712,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007521145744249225,
      "kl": 0.001152332144556567,
      "learning_rate": 9.923297823066234e-07,
      "loss": 0.0001,
      "num_tokens": 22863366.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 829,
      "step_time": 16.91843691468239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 206.375,
      "completions/mean_terminated_length": 206.375,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.23295189067721367,
      "epoch": 0.03844372394627142,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005820674123242497,
      "kl": 0.0009615173185011372,
      "learning_rate": 9.923205187586846e-07,
      "loss": 0.0,
      "num_tokens": 22887772.0,
      "reward": 0.4008028209209442,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.4008028209209442,
      "rewards/reward_func/std": 0.0,
      "step": 830,
      "step_time": 20.30594377592206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 185.125,
      "completions/mean_terminated_length": 185.125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.32865454256534576,
      "epoch": 0.038490041685965724,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06871698051691055,
      "kl": 0.0013664969301316887,
      "learning_rate": 9.923112552107457e-07,
      "loss": -0.051,
      "num_tokens": 22918766.0,
      "reward": 0.019764235243201256,
      "reward_std": 0.07905694842338562,
      "rewards/reward_func/mean": 0.019764235243201256,
      "rewards/reward_func/std": 0.07905694097280502,
      "step": 831,
      "step_time": 21.14786373078823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 136.0625,
      "completions/mean_terminated_length": 136.0625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.34320181608200073,
      "epoch": 0.03853635942566003,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007403282797895372,
      "kl": 0.001321841060416773,
      "learning_rate": 9.923019916628068e-07,
      "loss": 0.0001,
      "num_tokens": 22939375.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 832,
      "step_time": 15.485927652567625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 183.25,
      "completions/mean_terminated_length": 183.25,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.40896496176719666,
      "epoch": 0.03858267716535433,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006073734839446843,
      "kl": 0.001216251141158864,
      "learning_rate": 9.92292728114868e-07,
      "loss": 0.0001,
      "num_tokens": 22967875.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 833,
      "step_time": 22.568044397979975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 188.25,
      "completions/mean_terminated_length": 188.25,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.36554165929555893,
      "epoch": 0.03862899490504863,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001040755887515843,
      "kl": 0.0014154599339235574,
      "learning_rate": 9.92283464566929e-07,
      "loss": 0.0001,
      "num_tokens": 22991287.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 834,
      "step_time": 18.86409217491746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 130.75,
      "completions/mean_terminated_length": 130.75,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.29438622295856476,
      "epoch": 0.038675312644742936,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009892889065667987,
      "kl": 0.0012028044293401763,
      "learning_rate": 9.922742010189902e-07,
      "loss": 0.0001,
      "num_tokens": 23016003.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 835,
      "step_time": 15.510872017592192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 137.25,
      "completions/mean_terminated_length": 137.25,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3107244223356247,
      "epoch": 0.03872163038443724,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019122130470350385,
      "kl": 0.0014791909197811037,
      "learning_rate": 9.922649374710513e-07,
      "loss": 0.0001,
      "num_tokens": 23051767.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 836,
      "step_time": 18.339373033493757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 254.5625,
      "completions/mean_terminated_length": 254.5625,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "entropy": 0.3048095479607582,
      "epoch": 0.03876794812413154,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06858941167593002,
      "kl": 0.0011206775961909443,
      "learning_rate": 9.922556739231124e-07,
      "loss": 0.0549,
      "num_tokens": 23082720.0,
      "reward": 0.509719967842102,
      "reward_std": 0.32573890686035156,
      "rewards/reward_func/mean": 0.509719967842102,
      "rewards/reward_func/std": 0.32573890686035156,
      "step": 837,
      "step_time": 27.66253012046218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 257.0,
      "completions/max_terminated_length": 257.0,
      "completions/mean_length": 214.25,
      "completions/mean_terminated_length": 214.25,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.39568687975406647,
      "epoch": 0.038814265863825845,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08459905534982681,
      "kl": 0.0010677657701307908,
      "learning_rate": 9.922464103751736e-07,
      "loss": -0.0867,
      "num_tokens": 23104692.0,
      "reward": 0.2371823638677597,
      "reward_std": 0.4242846965789795,
      "rewards/reward_func/mean": 0.2371823638677597,
      "rewards/reward_func/std": 0.4242846965789795,
      "step": 838,
      "step_time": 21.718665331602097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 183.3125,
      "completions/mean_terminated_length": 183.3125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3188238888978958,
      "epoch": 0.03886058360352015,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09563377499580383,
      "kl": 0.0014460600505117327,
      "learning_rate": 9.92237146827235e-07,
      "loss": -0.025,
      "num_tokens": 23126137.0,
      "reward": 0.4231693744659424,
      "reward_std": 0.43837499618530273,
      "rewards/reward_func/mean": 0.4231693744659424,
      "rewards/reward_func/std": 0.43837499618530273,
      "step": 839,
      "step_time": 19.095177225768566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 126.0,
      "completions/mean_terminated_length": 126.0,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2542569972574711,
      "epoch": 0.03890690134321445,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008802962256595492,
      "kl": 0.001020480354782194,
      "learning_rate": 9.92227883279296e-07,
      "loss": 0.0001,
      "num_tokens": 23146905.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 840,
      "step_time": 13.482799373567104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 179.5,
      "completions/mean_terminated_length": 179.5,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.21041544526815414,
      "epoch": 0.038953219082908754,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0753721222281456,
      "kl": 0.0010078297345899045,
      "learning_rate": 9.92218619731357e-07,
      "loss": 0.0082,
      "num_tokens": 23175985.0,
      "reward": 0.9868549108505249,
      "reward_std": 0.03591921180486679,
      "rewards/reward_func/mean": 0.9868549108505249,
      "rewards/reward_func/std": 0.035919200628995895,
      "step": 841,
      "step_time": 20.711773075163364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 122.75,
      "completions/mean_terminated_length": 122.75,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.24506905302405357,
      "epoch": 0.038999536822603056,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001000660820864141,
      "kl": 0.0011933351051993668,
      "learning_rate": 9.922093561834183e-07,
      "loss": 0.0001,
      "num_tokens": 23195421.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 842,
      "step_time": 14.039854612201452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 194.6875,
      "completions/mean_terminated_length": 194.6875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.3695313408970833,
      "epoch": 0.03904585456229736,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011918543605133891,
      "kl": 0.0014249386440496892,
      "learning_rate": 9.922000926354794e-07,
      "loss": 0.0001,
      "num_tokens": 23221464.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 843,
      "step_time": 21.10698227584362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 333.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 294.9375,
      "completions/mean_terminated_length": 294.9375,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "entropy": 0.21876288205385208,
      "epoch": 0.03909217230199166,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06910669058561325,
      "kl": 0.0007599894743179902,
      "learning_rate": 9.921908290875405e-07,
      "loss": -0.0502,
      "num_tokens": 23250951.0,
      "reward": 0.6757249236106873,
      "reward_std": 0.26562076807022095,
      "rewards/reward_func/mean": 0.6757249236106873,
      "rewards/reward_func/std": 0.26562079787254333,
      "step": 844,
      "step_time": 29.39313641563058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 145.1875,
      "completions/mean_terminated_length": 145.1875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3069569543004036,
      "epoch": 0.039138490041685965,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016099393833428621,
      "kl": 0.0014371881261467934,
      "learning_rate": 9.921815655396016e-07,
      "loss": 0.0001,
      "num_tokens": 23278538.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 845,
      "step_time": 17.548515994101763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 149.8125,
      "completions/mean_terminated_length": 149.8125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.17683753743767738,
      "epoch": 0.03918480778138027,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006053652614355087,
      "kl": 0.0007061581200105138,
      "learning_rate": 9.921723019916628e-07,
      "loss": 0.0,
      "num_tokens": 23302695.0,
      "reward": 0.8883547186851501,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8883547186851501,
      "rewards/reward_func/std": 0.0,
      "step": 846,
      "step_time": 16.126937676221132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 130.875,
      "completions/mean_terminated_length": 130.875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.3190002292394638,
      "epoch": 0.03923112552107457,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012537972070276737,
      "kl": 0.0014405875408556312,
      "learning_rate": 9.92163038443724e-07,
      "loss": 0.0001,
      "num_tokens": 23322405.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 847,
      "step_time": 16.301903445273638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 177.6875,
      "completions/mean_terminated_length": 177.6875,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.2442517653107643,
      "epoch": 0.039277443260768874,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016692023491486907,
      "kl": 0.0013725541066378355,
      "learning_rate": 9.92153774895785e-07,
      "loss": 0.0001,
      "num_tokens": 23345440.0,
      "reward": 0.4111122786998749,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.4111122786998749,
      "rewards/reward_func/std": 0.0,
      "step": 848,
      "step_time": 17.377079091966152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 192.0625,
      "completions/mean_terminated_length": 192.0625,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.20605628937482834,
      "epoch": 0.03932376100046318,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08510545641183853,
      "kl": 0.0008309491677209735,
      "learning_rate": 9.921445113478461e-07,
      "loss": -0.0259,
      "num_tokens": 23383105.0,
      "reward": 0.3339391350746155,
      "reward_std": 0.0740714743733406,
      "rewards/reward_func/mean": 0.3339391350746155,
      "rewards/reward_func/std": 0.0740714892745018,
      "step": 849,
      "step_time": 22.711984291672707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 156.5,
      "completions/mean_terminated_length": 156.5,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.22407619282603264,
      "epoch": 0.03937007874015748,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011288082459941506,
      "kl": 0.0011935117654502392,
      "learning_rate": 9.921352477999073e-07,
      "loss": 0.0001,
      "num_tokens": 23403849.0,
      "reward": 0.5488116145133972,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5488116145133972,
      "rewards/reward_func/std": 0.0,
      "step": 850,
      "step_time": 15.657405402511358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 206.625,
      "completions/mean_terminated_length": 206.625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.2101145051419735,
      "epoch": 0.03941639647985178,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05516713857650757,
      "kl": 0.0008015447965590283,
      "learning_rate": 9.921259842519684e-07,
      "loss": -0.013,
      "num_tokens": 23427795.0,
      "reward": 0.9432417750358582,
      "reward_std": 0.022156143561005592,
      "rewards/reward_func/mean": 0.9432417750358582,
      "rewards/reward_func/std": 0.022156143561005592,
      "step": 851,
      "step_time": 20.915113903582096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 174.1875,
      "completions/mean_terminated_length": 174.1875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.34642454981803894,
      "epoch": 0.039462714219546086,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10046864300966263,
      "kl": 0.002284130488988012,
      "learning_rate": 9.921167207040297e-07,
      "loss": -0.0457,
      "num_tokens": 23464934.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 852,
      "step_time": 23.169690739363432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 178.25,
      "completions/mean_terminated_length": 178.25,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.2737343981862068,
      "epoch": 0.03950903195924039,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09689640998840332,
      "kl": 0.0012106007779948413,
      "learning_rate": 9.921074571560909e-07,
      "loss": -0.022,
      "num_tokens": 23499866.0,
      "reward": 0.919789731502533,
      "reward_std": 0.07305874675512314,
      "rewards/reward_func/mean": 0.919789731502533,
      "rewards/reward_func/std": 0.07305874675512314,
      "step": 853,
      "step_time": 20.427601240575314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 174.625,
      "completions/mean_terminated_length": 174.625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.1593393310904503,
      "epoch": 0.03955534969893469,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00044821377377957106,
      "kl": 0.0005306711173034273,
      "learning_rate": 9.920981936081518e-07,
      "loss": 0.0,
      "num_tokens": 23527508.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 854,
      "step_time": 19.216147657483816
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 173.625,
      "completions/mean_terminated_length": 173.625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.25104890763759613,
      "epoch": 0.039601667438628994,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007444969844073057,
      "kl": 0.0022193838376551867,
      "learning_rate": 9.920889300602129e-07,
      "loss": 0.0001,
      "num_tokens": 23565038.0,
      "reward": 0.894839346408844,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.894839346408844,
      "rewards/reward_func/std": 0.0,
      "step": 855,
      "step_time": 21.615963652729988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 180.0625,
      "completions/mean_terminated_length": 180.0625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3022039160132408,
      "epoch": 0.0396479851783233,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07054363191127777,
      "kl": 0.0011581739236135036,
      "learning_rate": 9.920796665122742e-07,
      "loss": 0.0049,
      "num_tokens": 23596463.0,
      "reward": 0.21576446294784546,
      "reward_std": 0.37498414516448975,
      "rewards/reward_func/mean": 0.21576446294784546,
      "rewards/reward_func/std": 0.37498414516448975,
      "step": 856,
      "step_time": 20.23256105557084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 281.0,
      "completions/max_terminated_length": 281.0,
      "completions/mean_length": 220.875,
      "completions/mean_terminated_length": 220.875,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.28507737815380096,
      "epoch": 0.0396943029180176,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06856705248355865,
      "kl": 0.0010991398594342172,
      "learning_rate": 9.920704029643354e-07,
      "loss": -0.0638,
      "num_tokens": 23628525.0,
      "reward": 0.08405046164989471,
      "reward_std": 0.0403740294277668,
      "rewards/reward_func/mean": 0.08405046164989471,
      "rewards/reward_func/std": 0.0403740257024765,
      "step": 857,
      "step_time": 25.41255698353052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 173.125,
      "completions/mean_terminated_length": 173.125,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.23209454864263535,
      "epoch": 0.0397406206577119,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07368174940347672,
      "kl": 0.0009128926321864128,
      "learning_rate": 9.920611394163965e-07,
      "loss": 0.0181,
      "num_tokens": 23685775.0,
      "reward": 0.8295896053314209,
      "reward_std": 0.00487559475004673,
      "rewards/reward_func/mean": 0.8295896053314209,
      "rewards/reward_func/std": 0.004875591490417719,
      "step": 858,
      "step_time": 27.572569452226162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 159.6875,
      "completions/mean_terminated_length": 159.6875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.33524635434150696,
      "epoch": 0.039786938397406206,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000995760434307158,
      "kl": 0.0011498018284328282,
      "learning_rate": 9.920518758684576e-07,
      "loss": 0.0001,
      "num_tokens": 23720682.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 859,
      "step_time": 19.58662161231041
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 130.25,
      "completions/mean_terminated_length": 130.25,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.24422408640384674,
      "epoch": 0.03983325613710051,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007869779947213829,
      "kl": 0.0008795668691163883,
      "learning_rate": 9.920426123205187e-07,
      "loss": 0.0,
      "num_tokens": 23746990.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 860,
      "step_time": 15.046861194074154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 177.9375,
      "completions/mean_terminated_length": 177.9375,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.35781367123126984,
      "epoch": 0.03987957387679481,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003613840788602829,
      "kl": 0.002170583524275571,
      "learning_rate": 9.920333487725799e-07,
      "loss": 0.0001,
      "num_tokens": 23776781.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 861,
      "step_time": 21.053212970495224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 357.0,
      "completions/max_terminated_length": 357.0,
      "completions/mean_length": 298.0625,
      "completions/mean_terminated_length": 298.0625,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "entropy": 0.3049522712826729,
      "epoch": 0.039925891616489115,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.058802779763936996,
      "kl": 0.001043334777932614,
      "learning_rate": 9.92024085224641e-07,
      "loss": -0.0937,
      "num_tokens": 23807566.0,
      "reward": 0.8277145624160767,
      "reward_std": 0.3702131509780884,
      "rewards/reward_func/mean": 0.8277145624160767,
      "rewards/reward_func/std": 0.3702131509780884,
      "step": 862,
      "step_time": 29.90588680282235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 171.5,
      "completions/mean_terminated_length": 171.5,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.13397923856973648,
      "epoch": 0.03997220935618342,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001241574645973742,
      "kl": 0.0006542782066389918,
      "learning_rate": 9.92014821676702e-07,
      "loss": 0.0,
      "num_tokens": 23839910.0,
      "reward": 0.8914703726768494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8914703726768494,
      "rewards/reward_func/std": 0.0,
      "step": 863,
      "step_time": 19.784176409244537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 121.125,
      "completions/mean_terminated_length": 121.125,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2424134723842144,
      "epoch": 0.04001852709587772,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008954207296483219,
      "kl": 0.0009859423444140702,
      "learning_rate": 9.920055581287632e-07,
      "loss": 0.0,
      "num_tokens": 23859256.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 864,
      "step_time": 13.532780464738607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 142.375,
      "completions/mean_terminated_length": 142.375,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3090411126613617,
      "epoch": 0.040064844835572024,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008063865243457258,
      "kl": 0.0012019427958875895,
      "learning_rate": 9.919962945808244e-07,
      "loss": 0.0001,
      "num_tokens": 23891886.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 865,
      "step_time": 18.694202043116093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 198.5625,
      "completions/mean_terminated_length": 198.5625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.38849782943725586,
      "epoch": 0.040111162575266326,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007156712817959487,
      "kl": 0.001291893218876794,
      "learning_rate": 9.919870310328855e-07,
      "loss": 0.0001,
      "num_tokens": 23919591.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 866,
      "step_time": 20.873049806803465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 118.0,
      "completions/mean_terminated_length": 118.0,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.27741505205631256,
      "epoch": 0.04015748031496063,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007374704582616687,
      "kl": 0.001058865716913715,
      "learning_rate": 9.919777674849466e-07,
      "loss": 0.0001,
      "num_tokens": 23939639.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 867,
      "step_time": 12.763438243418932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 157.9375,
      "completions/mean_terminated_length": 157.9375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3621453046798706,
      "epoch": 0.04020379805465493,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011747012613341212,
      "kl": 0.0012898755667265505,
      "learning_rate": 9.919685039370077e-07,
      "loss": 0.0001,
      "num_tokens": 23961366.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 868,
      "step_time": 16.606018260121346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 194.1875,
      "completions/mean_terminated_length": 194.1875,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.15809489414095879,
      "epoch": 0.040250115794349235,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15900906920433044,
      "kl": 0.0012111798860132694,
      "learning_rate": 9.91959240389069e-07,
      "loss": -0.0648,
      "num_tokens": 23985497.0,
      "reward": 0.8437808156013489,
      "reward_std": 0.1455288976430893,
      "rewards/reward_func/mean": 0.8437808156013489,
      "rewards/reward_func/std": 0.1455289125442505,
      "step": 869,
      "step_time": 19.615712836384773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 178.5,
      "completions/mean_terminated_length": 178.5,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.4628566801548004,
      "epoch": 0.04029643353404354,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09006517380475998,
      "kl": 0.0014835805050097406,
      "learning_rate": 9.919499768411302e-07,
      "loss": 0.0441,
      "num_tokens": 24010481.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 870,
      "step_time": 19.705397214740515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 180.375,
      "completions/mean_terminated_length": 180.375,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.14962278679013252,
      "epoch": 0.04034275127373784,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06065298616886139,
      "kl": 0.0007261905120685697,
      "learning_rate": 9.919407132931913e-07,
      "loss": -0.0105,
      "num_tokens": 24048807.0,
      "reward": 0.925041675567627,
      "reward_std": 0.01998889446258545,
      "rewards/reward_func/mean": 0.925041675567627,
      "rewards/reward_func/std": 0.0199888963252306,
      "step": 871,
      "step_time": 20.8767249584198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 140.0625,
      "completions/mean_terminated_length": 140.0625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.3491576388478279,
      "epoch": 0.040389069013432144,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018656151369214058,
      "kl": 0.0017743144708219916,
      "learning_rate": 9.919314497452524e-07,
      "loss": 0.0001,
      "num_tokens": 24081528.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 872,
      "step_time": 18.238072484731674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 138.0,
      "completions/mean_terminated_length": 138.0,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.3172079026699066,
      "epoch": 0.04043538675312645,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008050501928664744,
      "kl": 0.0010919965570792556,
      "learning_rate": 9.919221861973136e-07,
      "loss": 0.0001,
      "num_tokens": 24107192.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 873,
      "step_time": 15.443362895399332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 135.4375,
      "completions/mean_terminated_length": 135.4375,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2499684989452362,
      "epoch": 0.04048170449282075,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015567787922918797,
      "kl": 0.001300970237934962,
      "learning_rate": 9.919129226493747e-07,
      "loss": 0.0001,
      "num_tokens": 24127535.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 874,
      "step_time": 15.023460488766432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 135.9375,
      "completions/mean_terminated_length": 135.9375,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.33585429191589355,
      "epoch": 0.04052802223251505,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001978588756173849,
      "kl": 0.0015191614511422813,
      "learning_rate": 9.919036591014358e-07,
      "loss": 0.0001,
      "num_tokens": 24159726.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 875,
      "step_time": 16.590162009000778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 130.0,
      "completions/mean_terminated_length": 130.0,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.33138158917427063,
      "epoch": 0.040574339972209356,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008576564723625779,
      "kl": 0.0013629891618620604,
      "learning_rate": 9.91894395553497e-07,
      "loss": 0.0001,
      "num_tokens": 24185342.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 876,
      "step_time": 14.932226613163948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 156.4375,
      "completions/mean_terminated_length": 156.4375,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.2170252650976181,
      "epoch": 0.04062065771190366,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016902198549360037,
      "kl": 0.0015073083341121674,
      "learning_rate": 9.91885132005558e-07,
      "loss": 0.0001,
      "num_tokens": 24207701.0,
      "reward": 0.894839346408844,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.894839346408844,
      "rewards/reward_func/std": 0.0,
      "step": 877,
      "step_time": 16.58178937062621
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 214.25,
      "completions/mean_terminated_length": 214.25,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.2176058068871498,
      "epoch": 0.04066697545159796,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09435556083917618,
      "kl": 0.0012219191703479737,
      "learning_rate": 9.918758684576192e-07,
      "loss": 0.0961,
      "num_tokens": 24239897.0,
      "reward": 0.599349856376648,
      "reward_std": 0.23396223783493042,
      "rewards/reward_func/mean": 0.599349856376648,
      "rewards/reward_func/std": 0.23396222293376923,
      "step": 878,
      "step_time": 26.13662463799119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 128.1875,
      "completions/mean_terminated_length": 128.1875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.23211872950196266,
      "epoch": 0.040713293191292264,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012353787897154689,
      "kl": 0.0011770162091124803,
      "learning_rate": 9.918666049096803e-07,
      "loss": 0.0001,
      "num_tokens": 24259372.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 879,
      "step_time": 13.605142381042242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 150.25,
      "completions/mean_terminated_length": 150.25,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.35946550220251083,
      "epoch": 0.04075961093098657,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010624685091897845,
      "kl": 0.0015476414700970054,
      "learning_rate": 9.918573413617414e-07,
      "loss": 0.0001,
      "num_tokens": 24291056.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 880,
      "step_time": 19.141439214348793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 176.625,
      "completions/mean_terminated_length": 176.625,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.31737447530031204,
      "epoch": 0.04080592867068087,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001965844538062811,
      "kl": 0.0018645181262400001,
      "learning_rate": 9.918480778138026e-07,
      "loss": 0.0001,
      "num_tokens": 24329018.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 881,
      "step_time": 21.36028627678752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 149.75,
      "completions/mean_terminated_length": 149.75,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.32912537455558777,
      "epoch": 0.04085224641037517,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013385425554588437,
      "kl": 0.0013619031524285674,
      "learning_rate": 9.91838814265864e-07,
      "loss": 0.0001,
      "num_tokens": 24357574.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 882,
      "step_time": 16.5103618837893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 219.25,
      "completions/mean_terminated_length": 219.25,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "entropy": 0.2413436844944954,
      "epoch": 0.040898564150069476,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006712871254421771,
      "kl": 0.0009202266519423574,
      "learning_rate": 9.91829550717925e-07,
      "loss": 0.0,
      "num_tokens": 24381178.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 883,
      "step_time": 21.478361073881388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 126.9375,
      "completions/mean_terminated_length": 126.9375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.29485518485307693,
      "epoch": 0.04094488188976378,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011903924169018865,
      "kl": 0.001347521523712203,
      "learning_rate": 9.918202871699862e-07,
      "loss": 0.0001,
      "num_tokens": 24413337.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 884,
      "step_time": 16.653850506991148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 137.375,
      "completions/mean_terminated_length": 137.375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.3341088593006134,
      "epoch": 0.04099119962945808,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018057597335428,
      "kl": 0.0015847394533921033,
      "learning_rate": 9.91811023622047e-07,
      "loss": 0.0001,
      "num_tokens": 24449391.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 885,
      "step_time": 18.39985877275467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 180.0625,
      "completions/mean_terminated_length": 180.0625,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.34564724564552307,
      "epoch": 0.041037517369152385,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11464457958936691,
      "kl": 0.0013137346832081676,
      "learning_rate": 9.918017600741084e-07,
      "loss": -0.0892,
      "num_tokens": 24473376.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 886,
      "step_time": 20.970414962619543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 125.6875,
      "completions/mean_terminated_length": 125.6875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.3436511158943176,
      "epoch": 0.04108383510884669,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011709880782291293,
      "kl": 0.001377139298710972,
      "learning_rate": 9.917924965261695e-07,
      "loss": 0.0001,
      "num_tokens": 24501003.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 887,
      "step_time": 16.413439992815256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 178.75,
      "completions/mean_terminated_length": 178.75,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.21886874735355377,
      "epoch": 0.04113015284854099,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08236662298440933,
      "kl": 0.0008434822811977938,
      "learning_rate": 9.917832329782307e-07,
      "loss": -0.0085,
      "num_tokens": 24534935.0,
      "reward": 0.9118726253509521,
      "reward_std": 0.052548982203006744,
      "rewards/reward_func/mean": 0.9118726253509521,
      "rewards/reward_func/std": 0.05254898592829704,
      "step": 888,
      "step_time": 20.45571358129382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 255.0,
      "completions/max_terminated_length": 255.0,
      "completions/mean_length": 224.0625,
      "completions/mean_terminated_length": 224.0625,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.1623828411102295,
      "epoch": 0.041176470588235294,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005422509857453406,
      "kl": 0.0007415378058794886,
      "learning_rate": 9.917739694302918e-07,
      "loss": 0.0,
      "num_tokens": 24566520.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 889,
      "step_time": 23.425292938947678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 179.0,
      "completions/mean_terminated_length": 179.0,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.4034350737929344,
      "epoch": 0.041222788327929596,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009264025138691068,
      "kl": 0.0012167560926172882,
      "learning_rate": 9.91764705882353e-07,
      "loss": 0.0001,
      "num_tokens": 24592968.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 890,
      "step_time": 19.325445406138897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 111.0,
      "completions/mean_terminated_length": 111.0,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.23975902423262596,
      "epoch": 0.0412691060676239,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001312562613748014,
      "kl": 0.0012075002596247941,
      "learning_rate": 9.91755442334414e-07,
      "loss": 0.0001,
      "num_tokens": 24612440.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 891,
      "step_time": 13.437476575374603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 121.75,
      "completions/mean_terminated_length": 121.75,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2583157308399677,
      "epoch": 0.0413154238073182,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008441361715085804,
      "kl": 0.0011131021892651916,
      "learning_rate": 9.917461787864751e-07,
      "loss": 0.0001,
      "num_tokens": 24635716.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 892,
      "step_time": 13.729831136763096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 137.75,
      "completions/mean_terminated_length": 137.75,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2921217456459999,
      "epoch": 0.041361741547012505,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00707467133179307,
      "kl": 0.0020684940682258457,
      "learning_rate": 9.917369152385363e-07,
      "loss": 0.0001,
      "num_tokens": 24660816.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 893,
      "step_time": 16.902304004877806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 175.0625,
      "completions/mean_terminated_length": 175.0625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.2442249245941639,
      "epoch": 0.04140805928670681,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06777264922857285,
      "kl": 0.000899084858247079,
      "learning_rate": 9.917276516905974e-07,
      "loss": -0.027,
      "num_tokens": 24683361.0,
      "reward": 0.8284921050071716,
      "reward_std": 0.2555558681488037,
      "rewards/reward_func/mean": 0.8284921050071716,
      "rewards/reward_func/std": 0.2555558979511261,
      "step": 894,
      "step_time": 18.19170006364584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 154.3125,
      "completions/mean_terminated_length": 154.3125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.2531934566795826,
      "epoch": 0.04145437702640111,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00047968150465749204,
      "kl": 0.000688874781189952,
      "learning_rate": 9.917183881426585e-07,
      "loss": 0.0,
      "num_tokens": 24710294.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 895,
      "step_time": 18.35441016405821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 135.125,
      "completions/mean_terminated_length": 135.125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.26756370812654495,
      "epoch": 0.041500694766095414,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011044122511520982,
      "kl": 0.0012441368453437462,
      "learning_rate": 9.917091245947199e-07,
      "loss": 0.0001,
      "num_tokens": 24731000.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 896,
      "step_time": 14.109066184610128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 119.5625,
      "completions/mean_terminated_length": 119.5625,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.21244856715202332,
      "epoch": 0.04154701250578972,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010753453243523836,
      "kl": 0.0012322746915742755,
      "learning_rate": 9.916998610467808e-07,
      "loss": 0.0001,
      "num_tokens": 24750241.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 897,
      "step_time": 13.320092979818583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 209.875,
      "completions/mean_terminated_length": 209.875,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.3756285607814789,
      "epoch": 0.04159333024548402,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07290878891944885,
      "kl": 0.0015508253709413111,
      "learning_rate": 9.91690597498842e-07,
      "loss": -0.1088,
      "num_tokens": 24778655.0,
      "reward": 0.13992351293563843,
      "reward_std": 0.3008265197277069,
      "rewards/reward_func/mean": 0.13992351293563843,
      "rewards/reward_func/std": 0.3008265197277069,
      "step": 898,
      "step_time": 24.30946659296751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 151.75,
      "completions/mean_terminated_length": 151.75,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.33028341084718704,
      "epoch": 0.04163964798517832,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000989797175861895,
      "kl": 0.0011517624952830374,
      "learning_rate": 9.916813339509032e-07,
      "loss": 0.0001,
      "num_tokens": 24808507.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 899,
      "step_time": 17.414367869496346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 158.375,
      "completions/mean_terminated_length": 158.375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.2317407764494419,
      "epoch": 0.041685965724872626,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010478079784661531,
      "kl": 0.001201534309075214,
      "learning_rate": 9.916720704029644e-07,
      "loss": 0.0001,
      "num_tokens": 24831665.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 900,
      "step_time": 16.36012415215373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 262.0,
      "completions/max_terminated_length": 262.0,
      "completions/mean_length": 221.4375,
      "completions/mean_terminated_length": 221.4375,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.2911301776766777,
      "epoch": 0.04173228346456693,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.057238928973674774,
      "kl": 0.0009754351049195975,
      "learning_rate": 9.916628068550255e-07,
      "loss": -0.0585,
      "num_tokens": 24858776.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 901,
      "step_time": 24.013755716383457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 180.0625,
      "completions/mean_terminated_length": 180.0625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.35609270632267,
      "epoch": 0.04177860120426123,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10632283985614777,
      "kl": 0.0011484703136375174,
      "learning_rate": 9.916535433070866e-07,
      "loss": -0.0499,
      "num_tokens": 24879705.0,
      "reward": 0.4157751798629761,
      "reward_std": 0.4877914488315582,
      "rewards/reward_func/mean": 0.4157751798629761,
      "rewards/reward_func/std": 0.4877914786338806,
      "step": 902,
      "step_time": 22.129552900791168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 122.875,
      "completions/mean_terminated_length": 122.875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.28633588552474976,
      "epoch": 0.041824918943955534,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007128144497983158,
      "kl": 0.0010221440315945074,
      "learning_rate": 9.916442797591477e-07,
      "loss": 0.0001,
      "num_tokens": 24900727.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 903,
      "step_time": 13.659861445426941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 198.625,
      "completions/mean_terminated_length": 198.625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.22553906217217445,
      "epoch": 0.04187123668364984,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002935833763331175,
      "kl": 0.0016055934393079951,
      "learning_rate": 9.916350162112089e-07,
      "loss": 0.0001,
      "num_tokens": 24923665.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 904,
      "step_time": 22.655254740267992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 345.0,
      "completions/max_terminated_length": 345.0,
      "completions/mean_length": 315.75,
      "completions/mean_terminated_length": 315.75,
      "completions/min_length": 260.0,
      "completions/min_terminated_length": 260.0,
      "entropy": 0.25387056171894073,
      "epoch": 0.04191755442334414,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05357150733470917,
      "kl": 0.0011125316377729177,
      "learning_rate": 9.9162575266327e-07,
      "loss": -0.0406,
      "num_tokens": 24959293.0,
      "reward": 0.9125852584838867,
      "reward_std": 0.06320253014564514,
      "rewards/reward_func/mean": 0.9125852584838867,
      "rewards/reward_func/std": 0.06320253014564514,
      "step": 905,
      "step_time": 31.907665256410837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 163.375,
      "completions/mean_terminated_length": 163.375,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.22939521074295044,
      "epoch": 0.04196387216303844,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005068927421234548,
      "kl": 0.0008163191378116608,
      "learning_rate": 9.916164891153311e-07,
      "loss": 0.0,
      "num_tokens": 24983539.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 906,
      "step_time": 16.72318310290575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 117.25,
      "completions/mean_terminated_length": 117.25,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.21878432855010033,
      "epoch": 0.042010189902732746,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015956859569996595,
      "kl": 0.0012934014084748924,
      "learning_rate": 9.916072255673922e-07,
      "loss": 0.0001,
      "num_tokens": 25003031.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 907,
      "step_time": 13.752490423619747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 168.3125,
      "completions/mean_terminated_length": 168.3125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.22134746611118317,
      "epoch": 0.04205650764242705,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013118194183334708,
      "kl": 0.0009753816411830485,
      "learning_rate": 9.915979620194534e-07,
      "loss": 0.0,
      "num_tokens": 25024412.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 908,
      "step_time": 17.913808669894934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 217.375,
      "completions/mean_terminated_length": 217.375,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "entropy": 0.18628476187586784,
      "epoch": 0.04210282538212135,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05642639100551605,
      "kl": 0.000570238204090856,
      "learning_rate": 9.915886984715145e-07,
      "loss": -0.0523,
      "num_tokens": 25046866.0,
      "reward": 0.5372694134712219,
      "reward_std": 0.10262186825275421,
      "rewards/reward_func/mean": 0.5372694134712219,
      "rewards/reward_func/std": 0.10262187570333481,
      "step": 909,
      "step_time": 21.101673137396574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 168.6875,
      "completions/mean_terminated_length": 168.6875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.46945177018642426,
      "epoch": 0.042149143121815655,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008493889472447336,
      "kl": 0.001389862794894725,
      "learning_rate": 9.915794349235756e-07,
      "loss": 0.0001,
      "num_tokens": 25074493.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 910,
      "step_time": 21.144909985363483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 119.375,
      "completions/mean_terminated_length": 119.375,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.25054027885198593,
      "epoch": 0.04219546086150996,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000666202453430742,
      "kl": 0.001013650165987201,
      "learning_rate": 9.915701713756367e-07,
      "loss": 0.0001,
      "num_tokens": 25094355.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 911,
      "step_time": 13.317175682634115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 125.0625,
      "completions/mean_terminated_length": 125.0625,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.28567124903202057,
      "epoch": 0.04224177860120426,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011674201814457774,
      "kl": 0.001341990107903257,
      "learning_rate": 9.91560907827698e-07,
      "loss": 0.0001,
      "num_tokens": 25117460.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 912,
      "step_time": 14.951915934681892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 137.0,
      "completions/mean_terminated_length": 137.0,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.2821376770734787,
      "epoch": 0.042288096340898564,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014422236708924174,
      "kl": 0.001285295671550557,
      "learning_rate": 9.915516442797592e-07,
      "loss": 0.0001,
      "num_tokens": 25139940.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 913,
      "step_time": 15.471151653677225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 132.3125,
      "completions/mean_terminated_length": 132.3125,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.341471403837204,
      "epoch": 0.042334414080592866,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000769183796364814,
      "kl": 0.0014067496813368052,
      "learning_rate": 9.915423807318203e-07,
      "loss": 0.0001,
      "num_tokens": 25161193.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 914,
      "step_time": 15.628828033804893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 190.0625,
      "completions/mean_terminated_length": 190.0625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3295915126800537,
      "epoch": 0.04238073182028717,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08101384341716766,
      "kl": 0.001092489721486345,
      "learning_rate": 9.915331171838812e-07,
      "loss": -0.0696,
      "num_tokens": 25182810.0,
      "reward": 0.37304767966270447,
      "reward_std": 0.3916044235229492,
      "rewards/reward_func/mean": 0.37304767966270447,
      "rewards/reward_func/std": 0.3916044235229492,
      "step": 915,
      "step_time": 22.14657584577799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 129.5,
      "completions/mean_terminated_length": 129.5,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2927909716963768,
      "epoch": 0.04242704955998147,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002993886824697256,
      "kl": 0.002071876573609188,
      "learning_rate": 9.915238536359426e-07,
      "loss": 0.0001,
      "num_tokens": 25207634.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 916,
      "step_time": 15.564940758049488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 126.375,
      "completions/mean_terminated_length": 126.375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2962787076830864,
      "epoch": 0.042473367299675775,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030500530265271664,
      "kl": 0.0018200974445790052,
      "learning_rate": 9.915145900880037e-07,
      "loss": 0.0001,
      "num_tokens": 25227960.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 917,
      "step_time": 14.766521293669939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 121.625,
      "completions/mean_terminated_length": 121.625,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2951975166797638,
      "epoch": 0.04251968503937008,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010513971792533994,
      "kl": 0.001188266251119785,
      "learning_rate": 9.915053265400648e-07,
      "loss": 0.0001,
      "num_tokens": 25256098.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 918,
      "step_time": 14.576378718018532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 183.1875,
      "completions/mean_terminated_length": 183.1875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.43721695989370346,
      "epoch": 0.04256600277906438,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005205922061577439,
      "kl": 0.0011477641965029761,
      "learning_rate": 9.91496062992126e-07,
      "loss": 0.0001,
      "num_tokens": 25293717.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 919,
      "step_time": 22.255840439349413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 171.375,
      "completions/mean_terminated_length": 171.375,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.3722553998231888,
      "epoch": 0.042612320518758684,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008447070722468197,
      "kl": 0.0011220781016163528,
      "learning_rate": 9.91486799444187e-07,
      "loss": 0.0001,
      "num_tokens": 25315771.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 920,
      "step_time": 17.35065533220768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 223.4375,
      "completions/mean_terminated_length": 223.4375,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.19360385835170746,
      "epoch": 0.04265863825845299,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06590349227190018,
      "kl": 0.0006857907719677314,
      "learning_rate": 9.914775358962482e-07,
      "loss": -0.0534,
      "num_tokens": 25345906.0,
      "reward": 0.7841700911521912,
      "reward_std": 0.12175922840833664,
      "rewards/reward_func/mean": 0.7841700911521912,
      "rewards/reward_func/std": 0.12175923585891724,
      "step": 921,
      "step_time": 23.794354770332575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 186.3125,
      "completions/mean_terminated_length": 186.3125,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.15523645281791687,
      "epoch": 0.04270495599814729,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003852669906336814,
      "kl": 0.0005026564249419607,
      "learning_rate": 9.914682723483093e-07,
      "loss": 0.0,
      "num_tokens": 25383495.0,
      "reward": 0.8869204521179199,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8869204521179199,
      "rewards/reward_func/std": 0.0,
      "step": 922,
      "step_time": 22.65672117099166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 130.625,
      "completions/mean_terminated_length": 130.625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.28473517298698425,
      "epoch": 0.04275127373784159,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000955732713919133,
      "kl": 0.0010942591761704534,
      "learning_rate": 9.914590088003704e-07,
      "loss": 0.0001,
      "num_tokens": 25408577.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 923,
      "step_time": 14.98143096268177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 125.8125,
      "completions/mean_terminated_length": 125.8125,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.32515229284763336,
      "epoch": 0.042797591477535896,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010904496302828193,
      "kl": 0.0014065119030419737,
      "learning_rate": 9.914497452524316e-07,
      "loss": 0.0001,
      "num_tokens": 25430094.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 924,
      "step_time": 15.248995453119278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 186.5625,
      "completions/mean_terminated_length": 186.5625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.19039393216371536,
      "epoch": 0.0428439092172302,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000549613730981946,
      "kl": 0.0007599204545840621,
      "learning_rate": 9.914404817044927e-07,
      "loss": 0.0,
      "num_tokens": 25469255.0,
      "reward": 0.894839346408844,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.894839346408844,
      "rewards/reward_func/std": 0.0,
      "step": 925,
      "step_time": 23.040058355778456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 154.5,
      "completions/mean_terminated_length": 154.5,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3542131409049034,
      "epoch": 0.0428902269569245,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012450783979147673,
      "kl": 0.0013468109245877713,
      "learning_rate": 9.91431218156554e-07,
      "loss": 0.0001,
      "num_tokens": 25489807.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 926,
      "step_time": 16.3294418156147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 329.0,
      "completions/max_terminated_length": 329.0,
      "completions/mean_length": 273.3125,
      "completions/mean_terminated_length": 273.3125,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "entropy": 0.19528140500187874,
      "epoch": 0.042936544696618804,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08553256094455719,
      "kl": 0.0008833478495944291,
      "learning_rate": 9.914219546086152e-07,
      "loss": -0.1422,
      "num_tokens": 25520516.0,
      "reward": 0.4365028440952301,
      "reward_std": 0.3096548616886139,
      "rewards/reward_func/mean": 0.4365028440952301,
      "rewards/reward_func/std": 0.3096548616886139,
      "step": 927,
      "step_time": 28.269976779818535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 201.75,
      "completions/mean_terminated_length": 201.75,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.32308219373226166,
      "epoch": 0.04298286243631311,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06600859761238098,
      "kl": 0.0014190545916790143,
      "learning_rate": 9.91412691060676e-07,
      "loss": -0.015,
      "num_tokens": 25543296.0,
      "reward": 0.3001863360404968,
      "reward_std": 0.4620959162712097,
      "rewards/reward_func/mean": 0.3001863360404968,
      "rewards/reward_func/std": 0.4620959162712097,
      "step": 928,
      "step_time": 19.748896960169077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 182.4375,
      "completions/mean_terminated_length": 182.4375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3041607439517975,
      "epoch": 0.04302918017600741,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006013662205077708,
      "kl": 0.0010577643843134865,
      "learning_rate": 9.914034275127374e-07,
      "loss": 0.0001,
      "num_tokens": 25564583.0,
      "reward": 0.7958667874336243,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7958667874336243,
      "rewards/reward_func/std": 0.0,
      "step": 929,
      "step_time": 18.69086018204689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 193.6875,
      "completions/mean_terminated_length": 193.6875,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.2321130819618702,
      "epoch": 0.04307549791570171,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07265283912420273,
      "kl": 0.0011636123526841402,
      "learning_rate": 9.913941639647985e-07,
      "loss": -0.0249,
      "num_tokens": 25586082.0,
      "reward": 0.8816871643066406,
      "reward_std": 0.12416164577007294,
      "rewards/reward_func/mean": 0.8816871643066406,
      "rewards/reward_func/std": 0.12416165322065353,
      "step": 930,
      "step_time": 19.568458043038845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 116.0,
      "completions/mean_terminated_length": 116.0,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3222050368785858,
      "epoch": 0.043121815655396016,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001881249132566154,
      "kl": 0.0021569784148596227,
      "learning_rate": 9.913849004168597e-07,
      "loss": 0.0001,
      "num_tokens": 25616818.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 931,
      "step_time": 15.721865832805634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 211.125,
      "completions/mean_terminated_length": 211.125,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.408104345202446,
      "epoch": 0.04316813339509032,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006213901797309518,
      "kl": 0.0011445400887168944,
      "learning_rate": 9.913756368689208e-07,
      "loss": 0.0001,
      "num_tokens": 25644132.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 932,
      "step_time": 21.428357008844614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 155.875,
      "completions/mean_terminated_length": 155.875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.31693416833877563,
      "epoch": 0.04321445113478462,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07095703482627869,
      "kl": 0.0012221091310493648,
      "learning_rate": 9.91366373320982e-07,
      "loss": 0.0111,
      "num_tokens": 25668402.0,
      "reward": 0.0486750490963459,
      "reward_std": 0.1947001814842224,
      "rewards/reward_func/mean": 0.0486750490963459,
      "rewards/reward_func/std": 0.1947001963853836,
      "step": 933,
      "step_time": 16.34353280812502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 202.75,
      "completions/mean_terminated_length": 202.75,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.25528978556394577,
      "epoch": 0.043260768874478925,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009993150597438216,
      "kl": 0.001088865305064246,
      "learning_rate": 9.91357109773043e-07,
      "loss": 0.0001,
      "num_tokens": 25695598.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 934,
      "step_time": 21.84546685218811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 140.875,
      "completions/mean_terminated_length": 140.875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.27832359075546265,
      "epoch": 0.04330708661417323,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024308261927217245,
      "kl": 0.0015019922284409404,
      "learning_rate": 9.913478462251042e-07,
      "loss": 0.0001,
      "num_tokens": 25728604.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 935,
      "step_time": 17.569377820938826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 136.5625,
      "completions/mean_terminated_length": 136.5625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.39912573993206024,
      "epoch": 0.04335340435386753,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009236956248059869,
      "kl": 0.0014983774744905531,
      "learning_rate": 9.913385826771653e-07,
      "loss": 0.0001,
      "num_tokens": 25752997.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 936,
      "step_time": 16.631429065018892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 203.4375,
      "completions/mean_terminated_length": 203.4375,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.4405878186225891,
      "epoch": 0.043399722093561834,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007935868925414979,
      "kl": 0.001365014468319714,
      "learning_rate": 9.913293191292264e-07,
      "loss": 0.0001,
      "num_tokens": 25777132.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 937,
      "step_time": 20.493006374686956
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 170.5,
      "completions/mean_terminated_length": 170.5,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3793506845831871,
      "epoch": 0.043446039833256136,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013016482116654515,
      "kl": 0.0016592135361861438,
      "learning_rate": 9.913200555812875e-07,
      "loss": 0.0001,
      "num_tokens": 25822580.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 938,
      "step_time": 23.562003422528505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 163.0,
      "completions/mean_terminated_length": 163.0,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.3772532120347023,
      "epoch": 0.04349235757295044,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.131752610206604,
      "kl": 0.0012197851901873946,
      "learning_rate": 9.913107920333489e-07,
      "loss": -0.0841,
      "num_tokens": 25846276.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 939,
      "step_time": 19.540890879929066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 125.9375,
      "completions/mean_terminated_length": 125.9375,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.1573271043598652,
      "epoch": 0.04353867531264474,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000697934243362397,
      "kl": 0.0007628782768733799,
      "learning_rate": 9.913015284854098e-07,
      "loss": 0.0,
      "num_tokens": 25879619.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 940,
      "step_time": 16.907122440636158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 163.625,
      "completions/mean_terminated_length": 163.625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.260529488325119,
      "epoch": 0.043584993052339045,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10483313351869583,
      "kl": 0.0011952401837334037,
      "learning_rate": 9.91292264937471e-07,
      "loss": 0.0724,
      "num_tokens": 25907725.0,
      "reward": 0.877037763595581,
      "reward_std": 0.23387674987316132,
      "rewards/reward_func/mean": 0.877037763595581,
      "rewards/reward_func/std": 0.23387673497200012,
      "step": 941,
      "step_time": 21.299443446099758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 202.5625,
      "completions/mean_terminated_length": 202.5625,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.20744986832141876,
      "epoch": 0.04363131079203335,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07444166392087936,
      "kl": 0.0012948303774464875,
      "learning_rate": 9.912830013895322e-07,
      "loss": -0.0117,
      "num_tokens": 25945494.0,
      "reward": 0.8516945838928223,
      "reward_std": 0.22711853682994843,
      "rewards/reward_func/mean": 0.8516945838928223,
      "rewards/reward_func/std": 0.22711855173110962,
      "step": 942,
      "step_time": 24.27873231470585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 132.625,
      "completions/mean_terminated_length": 132.625,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.32297907769680023,
      "epoch": 0.04367762853172765,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022987041156738997,
      "kl": 0.0015412152570206672,
      "learning_rate": 9.912737378415934e-07,
      "loss": 0.0001,
      "num_tokens": 25974688.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 943,
      "step_time": 17.78444692119956
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 187.875,
      "completions/mean_terminated_length": 187.875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.29273275285959244,
      "epoch": 0.043723946271421954,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006314238416962326,
      "kl": 0.0008948555623646826,
      "learning_rate": 9.912644742936545e-07,
      "loss": 0.0,
      "num_tokens": 26011630.0,
      "reward": 0.8507331609725952,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8507331609725952,
      "rewards/reward_func/std": 0.0,
      "step": 944,
      "step_time": 23.350538298487663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 171.125,
      "completions/mean_terminated_length": 171.125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.23666785657405853,
      "epoch": 0.04377026401111626,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015405794838443398,
      "kl": 0.0015227313851937652,
      "learning_rate": 9.912552107457156e-07,
      "loss": 0.0001,
      "num_tokens": 26045904.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 945,
      "step_time": 19.95811043307185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 142.375,
      "completions/mean_terminated_length": 142.375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3757862076163292,
      "epoch": 0.04381658175081056,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013596073258668184,
      "kl": 0.0014998194528743625,
      "learning_rate": 9.912459471977767e-07,
      "loss": 0.0001,
      "num_tokens": 26065942.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 946,
      "step_time": 16.22507019340992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 152.25,
      "completions/mean_terminated_length": 152.25,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3453558087348938,
      "epoch": 0.04386289949050486,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001597587950527668,
      "kl": 0.0018345113785471767,
      "learning_rate": 9.912366836498379e-07,
      "loss": 0.0001,
      "num_tokens": 26120394.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 947,
      "step_time": 24.664816740900278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 204.8125,
      "completions/mean_terminated_length": 204.8125,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.39521076530218124,
      "epoch": 0.043909217230199166,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08535200357437134,
      "kl": 0.001448167604394257,
      "learning_rate": 9.91227420101899e-07,
      "loss": -0.0268,
      "num_tokens": 26145319.0,
      "reward": 0.04770008474588394,
      "reward_std": 0.018802374601364136,
      "rewards/reward_func/mean": 0.04770008474588394,
      "rewards/reward_func/std": 0.018802374601364136,
      "step": 948,
      "step_time": 25.312162697315216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 164.0,
      "completions/mean_terminated_length": 164.0,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3368866443634033,
      "epoch": 0.04395553496989347,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009091639658436179,
      "kl": 0.0012967442453373224,
      "learning_rate": 9.912181565539601e-07,
      "loss": 0.0001,
      "num_tokens": 26182023.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 949,
      "step_time": 20.87411253899336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 133.5,
      "completions/mean_terminated_length": 133.5,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2637072578072548,
      "epoch": 0.04400185270958777,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008538886904716492,
      "kl": 0.0010562004172243178,
      "learning_rate": 9.912088930060212e-07,
      "loss": 0.0001,
      "num_tokens": 26203711.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 950,
      "step_time": 15.144162889569998
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 127.875,
      "completions/mean_terminated_length": 127.875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2999601513147354,
      "epoch": 0.044048170449282074,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007343225297518075,
      "kl": 0.00115480792010203,
      "learning_rate": 9.911996294580824e-07,
      "loss": 0.0001,
      "num_tokens": 26227069.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 951,
      "step_time": 14.849574849009514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 154.125,
      "completions/mean_terminated_length": 154.125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.18190209940075874,
      "epoch": 0.04409448818897638,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08292274177074432,
      "kl": 0.0009367444145027548,
      "learning_rate": 9.911903659101435e-07,
      "loss": 0.0217,
      "num_tokens": 26249375.0,
      "reward": 0.9273681640625,
      "reward_std": 0.025895869359374046,
      "rewards/reward_func/mean": 0.9273681640625,
      "rewards/reward_func/std": 0.025895869359374046,
      "step": 952,
      "step_time": 16.659883372485638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 143.4375,
      "completions/mean_terminated_length": 143.4375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2833913266658783,
      "epoch": 0.04414080592867068,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024984402116388083,
      "kl": 0.0016165999695658684,
      "learning_rate": 9.911811023622046e-07,
      "loss": 0.0001,
      "num_tokens": 26270134.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 953,
      "step_time": 15.93040182814002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 164.375,
      "completions/mean_terminated_length": 164.375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.1734016165137291,
      "epoch": 0.04418712366836498,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007538609206676483,
      "kl": 0.0006580217886948958,
      "learning_rate": 9.911718388142657e-07,
      "loss": 0.0,
      "num_tokens": 26295244.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 954,
      "step_time": 17.942106883972883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 124.75,
      "completions/mean_terminated_length": 124.75,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.2591349147260189,
      "epoch": 0.044233441408059286,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013043932849541306,
      "kl": 0.0014181759615894407,
      "learning_rate": 9.911625752663269e-07,
      "loss": 0.0001,
      "num_tokens": 26314840.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 955,
      "step_time": 14.206559136509895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 161.9375,
      "completions/mean_terminated_length": 161.9375,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3751741051673889,
      "epoch": 0.04427975914775359,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019639788661152124,
      "kl": 0.002565555914770812,
      "learning_rate": 9.911533117183882e-07,
      "loss": 0.0001,
      "num_tokens": 26370039.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 956,
      "step_time": 26.0766384601593
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 142.6875,
      "completions/mean_terminated_length": 142.6875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.28951142728328705,
      "epoch": 0.04432607688744789,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008503881981596351,
      "kl": 0.0010200832184636965,
      "learning_rate": 9.911440481704493e-07,
      "loss": 0.0001,
      "num_tokens": 26389778.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 957,
      "step_time": 14.31048109382391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 209.375,
      "completions/mean_terminated_length": 209.375,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.1391860507428646,
      "epoch": 0.044372394627142195,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003694234765134752,
      "kl": 0.0004186817241134122,
      "learning_rate": 9.911347846225102e-07,
      "loss": 0.0,
      "num_tokens": 26427240.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 958,
      "step_time": 23.268010932952166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 168.875,
      "completions/mean_terminated_length": 168.875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.17917417362332344,
      "epoch": 0.0444187123668365,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00046418776037171483,
      "kl": 0.0007251384085975587,
      "learning_rate": 9.911255210745716e-07,
      "loss": 0.0,
      "num_tokens": 26448486.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 959,
      "step_time": 17.64775961264968
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 557.0,
      "completions/max_terminated_length": 557.0,
      "completions/mean_length": 373.5625,
      "completions/mean_terminated_length": 373.5625,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "entropy": 0.28814607486128807,
      "epoch": 0.0444650301065308,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05286726728081703,
      "kl": 0.001133385201683268,
      "learning_rate": 9.911162575266327e-07,
      "loss": 0.3167,
      "num_tokens": 26479039.0,
      "reward": 0.4375,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.4375,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 960,
      "step_time": 43.64147626236081
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 166.3125,
      "completions/mean_terminated_length": 166.3125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.1643184870481491,
      "epoch": 0.044511347846225104,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10498344153165817,
      "kl": 0.004419477394549176,
      "learning_rate": 9.911069939786938e-07,
      "loss": -0.0452,
      "num_tokens": 26506388.0,
      "reward": 0.8449504375457764,
      "reward_std": 0.21580785512924194,
      "rewards/reward_func/mean": 0.8449504375457764,
      "rewards/reward_func/std": 0.21580785512924194,
      "step": 961,
      "step_time": 18.347329638898373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 150.0,
      "completions/mean_terminated_length": 150.0,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.14045149832963943,
      "epoch": 0.044557665585919407,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08984079211950302,
      "kl": 0.000682404643157497,
      "learning_rate": 9.91097730430755e-07,
      "loss": -0.0536,
      "num_tokens": 26540084.0,
      "reward": 0.887914776802063,
      "reward_std": 0.1020917147397995,
      "rewards/reward_func/mean": 0.887914776802063,
      "rewards/reward_func/std": 0.1020917147397995,
      "step": 962,
      "step_time": 19.268025774508715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 150.9375,
      "completions/mean_terminated_length": 150.9375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3451591953635216,
      "epoch": 0.04460398332561371,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008214266272261739,
      "kl": 0.001269234111532569,
      "learning_rate": 9.91088466882816e-07,
      "loss": 0.0001,
      "num_tokens": 26576579.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 963,
      "step_time": 19.809270162135363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 147.75,
      "completions/mean_terminated_length": 147.75,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.33301592618227005,
      "epoch": 0.04465030106530801,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036126698832958937,
      "kl": 0.0016686512681189924,
      "learning_rate": 9.910792033348772e-07,
      "loss": 0.0001,
      "num_tokens": 26605919.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 964,
      "step_time": 18.184857320040464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 206.9375,
      "completions/mean_terminated_length": 206.9375,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.165915597230196,
      "epoch": 0.044696618805002315,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00042020107503049076,
      "kl": 0.0006393089715857059,
      "learning_rate": 9.910699397869383e-07,
      "loss": 0.0,
      "num_tokens": 26643710.0,
      "reward": 0.9111884832382202,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9111884832382202,
      "rewards/reward_func/std": 0.0,
      "step": 965,
      "step_time": 23.48654007539153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 178.1875,
      "completions/mean_terminated_length": 178.1875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.39551738649606705,
      "epoch": 0.04474293654469662,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008176004630513489,
      "kl": 0.0015144433709792793,
      "learning_rate": 9.910606762389995e-07,
      "loss": 0.0001,
      "num_tokens": 26691393.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 966,
      "step_time": 24.9347990937531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 148.9375,
      "completions/mean_terminated_length": 148.9375,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.35475438833236694,
      "epoch": 0.04478925428439092,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008755417657084763,
      "kl": 0.00114651262992993,
      "learning_rate": 9.910514126910606e-07,
      "loss": 0.0001,
      "num_tokens": 26713344.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 967,
      "step_time": 15.805172581225634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 144.0,
      "completions/mean_terminated_length": 144.0,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.28755972534418106,
      "epoch": 0.044835572024085224,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008892468176782131,
      "kl": 0.0012161753402324393,
      "learning_rate": 9.910421491431217e-07,
      "loss": 0.0001,
      "num_tokens": 26746592.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 968,
      "step_time": 18.469312489032745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 363.0,
      "completions/max_terminated_length": 363.0,
      "completions/mean_length": 226.25,
      "completions/mean_terminated_length": 226.25,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.3481305167078972,
      "epoch": 0.04488188976377953,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08232767134904861,
      "kl": 0.001309857121668756,
      "learning_rate": 9.91032885595183e-07,
      "loss": -0.151,
      "num_tokens": 26784116.0,
      "reward": 0.0625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.0625,
      "rewards/reward_func/std": 0.25,
      "step": 969,
      "step_time": 32.54627714306116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 136.4375,
      "completions/mean_terminated_length": 136.4375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.22975606471300125,
      "epoch": 0.04492820750347383,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014065310824662447,
      "kl": 0.0013225088478066027,
      "learning_rate": 9.910236220472442e-07,
      "loss": 0.0001,
      "num_tokens": 26803723.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 970,
      "step_time": 15.287191644310951
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 267.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 202.8125,
      "completions/mean_terminated_length": 202.8125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.3717750906944275,
      "epoch": 0.04497452524316813,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07482772320508957,
      "kl": 0.0012675386969931424,
      "learning_rate": 9.91014358499305e-07,
      "loss": 0.0792,
      "num_tokens": 26836296.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 971,
      "step_time": 24.93474406003952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 153.0625,
      "completions/mean_terminated_length": 153.0625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.4077810049057007,
      "epoch": 0.045020842982862436,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000945600273553282,
      "kl": 0.0015428498154506087,
      "learning_rate": 9.910050949513662e-07,
      "loss": 0.0001,
      "num_tokens": 26890249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 972,
      "step_time": 24.346562299877405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 219.4375,
      "completions/mean_terminated_length": 219.4375,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.35865750908851624,
      "epoch": 0.04506716072255674,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07685404270887375,
      "kl": 0.0010975960321957245,
      "learning_rate": 9.909958314034275e-07,
      "loss": -0.029,
      "num_tokens": 26918000.0,
      "reward": 0.3125,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.3125,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 973,
      "step_time": 22.07788737118244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 180.5,
      "completions/mean_terminated_length": 180.5,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.16718735173344612,
      "epoch": 0.04511347846225104,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011189704528078437,
      "kl": 0.0008590102370362729,
      "learning_rate": 9.909865678554887e-07,
      "loss": 0.0,
      "num_tokens": 26946600.0,
      "reward": 0.6041615009307861,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6041615009307861,
      "rewards/reward_func/std": 0.0,
      "step": 974,
      "step_time": 20.742748513817787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 129.0,
      "completions/mean_terminated_length": 129.0,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2968504950404167,
      "epoch": 0.045159796201945344,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015855777310207486,
      "kl": 0.0014514246431645006,
      "learning_rate": 9.909773043075498e-07,
      "loss": 0.0001,
      "num_tokens": 26969752.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 975,
      "step_time": 14.812765996903181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 188.8125,
      "completions/mean_terminated_length": 188.8125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.22856667265295982,
      "epoch": 0.04520611394163965,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001736610196530819,
      "kl": 0.0008091626805253327,
      "learning_rate": 9.90968040759611e-07,
      "loss": 0.0,
      "num_tokens": 26994629.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 976,
      "step_time": 20.00553661584854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 272.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 212.8125,
      "completions/mean_terminated_length": 212.8125,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.2676115930080414,
      "epoch": 0.04525243168133395,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08182648569345474,
      "kl": 0.0008092873758869246,
      "learning_rate": 9.90958777211672e-07,
      "loss": -0.0264,
      "num_tokens": 27032546.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 977,
      "step_time": 26.288400877267122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 124.5,
      "completions/mean_terminated_length": 124.5,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.1920037753880024,
      "epoch": 0.04529874942102825,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012227949919179082,
      "kl": 0.0010539426148170605,
      "learning_rate": 9.909495136637332e-07,
      "loss": 0.0001,
      "num_tokens": 27052122.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 978,
      "step_time": 13.783868838101625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 214.5,
      "completions/mean_terminated_length": 214.5,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.16859596595168114,
      "epoch": 0.045345067160722556,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000766526150982827,
      "kl": 0.0008085589506663382,
      "learning_rate": 9.909402501157943e-07,
      "loss": 0.0,
      "num_tokens": 27080802.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 979,
      "step_time": 21.164541829377413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 321.0,
      "completions/max_terminated_length": 321.0,
      "completions/mean_length": 250.4375,
      "completions/mean_terminated_length": 250.4375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.36518095433712006,
      "epoch": 0.04539138490041686,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.061376895755529404,
      "kl": 0.0011561915162019432,
      "learning_rate": 9.909309865678554e-07,
      "loss": 0.0109,
      "num_tokens": 27109481.0,
      "reward": 0.1178591400384903,
      "reward_std": 0.3220524787902832,
      "rewards/reward_func/mean": 0.1178591400384903,
      "rewards/reward_func/std": 0.3220525085926056,
      "step": 980,
      "step_time": 27.487699549645185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 123.5,
      "completions/mean_terminated_length": 123.5,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.27842877805233,
      "epoch": 0.04543770264011116,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001050975057296455,
      "kl": 0.001031560663250275,
      "learning_rate": 9.909217230199165e-07,
      "loss": 0.0001,
      "num_tokens": 27130609.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 981,
      "step_time": 13.546759389340878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 137.3125,
      "completions/mean_terminated_length": 137.3125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2758451849222183,
      "epoch": 0.045484020379805465,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010525949764996767,
      "kl": 0.0012523937621153891,
      "learning_rate": 9.909124594719779e-07,
      "loss": 0.0001,
      "num_tokens": 27152678.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 982,
      "step_time": 15.912566743791103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 188.25,
      "completions/mean_terminated_length": 188.25,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.4020918607711792,
      "epoch": 0.04553033811949977,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003653065301477909,
      "kl": 0.0019453027343843132,
      "learning_rate": 9.909031959240388e-07,
      "loss": 0.0001,
      "num_tokens": 27175626.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 983,
      "step_time": 21.362744972109795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 160.75,
      "completions/mean_terminated_length": 160.75,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3147260621190071,
      "epoch": 0.04557665585919407,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007648139144293964,
      "kl": 0.0009988356614485383,
      "learning_rate": 9.908939323761e-07,
      "loss": 0.0001,
      "num_tokens": 27198742.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 984,
      "step_time": 16.57590225711465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 126.375,
      "completions/mean_terminated_length": 126.375,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2475137710571289,
      "epoch": 0.045622973598888374,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005296228919178247,
      "kl": 0.0008576173277106136,
      "learning_rate": 9.90884668828161e-07,
      "loss": 0.0,
      "num_tokens": 27219948.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 985,
      "step_time": 14.385677341371775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 120.5,
      "completions/mean_terminated_length": 120.5,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.3114180639386177,
      "epoch": 0.04566929133858268,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008898326195776463,
      "kl": 0.00121376384049654,
      "learning_rate": 9.908754052802224e-07,
      "loss": 0.0001,
      "num_tokens": 27243668.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 986,
      "step_time": 14.067319616675377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 156.1875,
      "completions/mean_terminated_length": 156.1875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.28713928908109665,
      "epoch": 0.04571560907827698,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008185781189240515,
      "kl": 0.001233345188666135,
      "learning_rate": 9.908661417322835e-07,
      "loss": 0.0001,
      "num_tokens": 27263687.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 987,
      "step_time": 17.29108925536275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 152.3125,
      "completions/mean_terminated_length": 152.3125,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3779827654361725,
      "epoch": 0.04576192681797128,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010614685015752912,
      "kl": 0.001530019799247384,
      "learning_rate": 9.908568781843446e-07,
      "loss": 0.0001,
      "num_tokens": 27284460.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 988,
      "step_time": 15.335717637091875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 172.5,
      "completions/mean_terminated_length": 172.5,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3747268542647362,
      "epoch": 0.045808244557665585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008985947351902723,
      "kl": 0.0016214477946050465,
      "learning_rate": 9.908476146364057e-07,
      "loss": 0.0001,
      "num_tokens": 27340980.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 989,
      "step_time": 26.592960093170404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 167.125,
      "completions/mean_terminated_length": 167.125,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.19721996411681175,
      "epoch": 0.04585456229735989,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005840666126459837,
      "kl": 0.0008120789570966735,
      "learning_rate": 9.908383510884669e-07,
      "loss": 0.0,
      "num_tokens": 27364886.0,
      "reward": 0.8611735105514526,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8611735105514526,
      "rewards/reward_func/std": 0.0,
      "step": 990,
      "step_time": 18.10546052083373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 183.125,
      "completions/mean_terminated_length": 183.125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.3560015484690666,
      "epoch": 0.04590088003705419,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007833911222405732,
      "kl": 0.0010475474409759045,
      "learning_rate": 9.90829087540528e-07,
      "loss": 0.0001,
      "num_tokens": 27396920.0,
      "reward": 0.2741396427154541,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.2741396427154541,
      "rewards/reward_func/std": 0.0,
      "step": 991,
      "step_time": 21.2666631154716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 167.375,
      "completions/mean_terminated_length": 167.375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.31283534318208694,
      "epoch": 0.045947197776748494,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009348664898425341,
      "kl": 0.0009910411317832768,
      "learning_rate": 9.908198239925891e-07,
      "loss": 0.0001,
      "num_tokens": 27418654.0,
      "reward": 1.5776187467508862e-08,
      "reward_std": 4.2069832062452406e-09,
      "rewards/reward_func/mean": 1.5776187467508862e-08,
      "rewards/reward_func/std": 4.2069832062452406e-09,
      "step": 992,
      "step_time": 21.00212061777711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 190.0625,
      "completions/mean_terminated_length": 190.0625,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.21668926626443863,
      "epoch": 0.0459935155164428,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012792462948709726,
      "kl": 0.0011507587332744151,
      "learning_rate": 9.908105604446502e-07,
      "loss": 0.0001,
      "num_tokens": 27453263.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 993,
      "step_time": 21.143105305731297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 179.9375,
      "completions/mean_terminated_length": 179.9375,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.21674254164099693,
      "epoch": 0.0460398332561371,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08749733120203018,
      "kl": 0.0011520619154907763,
      "learning_rate": 9.908012968967114e-07,
      "loss": -0.0055,
      "num_tokens": 27479342.0,
      "reward": 0.956403374671936,
      "reward_std": 0.01162576675415039,
      "rewards/reward_func/mean": 0.956403374671936,
      "rewards/reward_func/std": 0.011625767685472965,
      "step": 994,
      "step_time": 19.21072856336832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 263.0,
      "completions/max_terminated_length": 263.0,
      "completions/mean_length": 199.9375,
      "completions/mean_terminated_length": 199.9375,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.3687305301427841,
      "epoch": 0.0460861509958314,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000500780763104558,
      "kl": 0.0010712180228438228,
      "learning_rate": 9.907920333487725e-07,
      "loss": 0.0001,
      "num_tokens": 27503437.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 995,
      "step_time": 22.20991675555706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 186.1875,
      "completions/mean_terminated_length": 186.1875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.45539455860853195,
      "epoch": 0.046132468735525706,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007872936548665166,
      "kl": 0.0014461501850746572,
      "learning_rate": 9.907827698008336e-07,
      "loss": 0.0001,
      "num_tokens": 27524672.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 996,
      "step_time": 19.799541417509317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 421.0,
      "completions/max_terminated_length": 421.0,
      "completions/mean_length": 210.9375,
      "completions/mean_terminated_length": 210.9375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.4072110280394554,
      "epoch": 0.04617878647522001,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06839299947023392,
      "kl": 0.002012406912399456,
      "learning_rate": 9.907735062528947e-07,
      "loss": -0.2488,
      "num_tokens": 27551599.0,
      "reward": 0.05903397873044014,
      "reward_std": 0.2361346185207367,
      "rewards/reward_func/mean": 0.05903397873044014,
      "rewards/reward_func/std": 0.2361346185207367,
      "step": 997,
      "step_time": 33.95599554479122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 372.0,
      "completions/max_terminated_length": 372.0,
      "completions/mean_length": 320.3125,
      "completions/mean_terminated_length": 320.3125,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "entropy": 0.2276455983519554,
      "epoch": 0.04622510421491431,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05795934796333313,
      "kl": 0.0009760240791365504,
      "learning_rate": 9.907642427049559e-07,
      "loss": -0.0316,
      "num_tokens": 27592628.0,
      "reward": 0.6055164337158203,
      "reward_std": 0.21161341667175293,
      "rewards/reward_func/mean": 0.6055164337158203,
      "rewards/reward_func/std": 0.21161341667175293,
      "step": 998,
      "step_time": 33.63652973622084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 263.0,
      "completions/max_terminated_length": 263.0,
      "completions/mean_length": 173.875,
      "completions/mean_terminated_length": 173.875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.33441710472106934,
      "epoch": 0.046271421954608614,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10504307597875595,
      "kl": 0.001504827494500205,
      "learning_rate": 9.907549791570172e-07,
      "loss": 0.1282,
      "num_tokens": 27625506.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 999,
      "step_time": 25.208656802773476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 175.4375,
      "completions/mean_terminated_length": 175.4375,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.188066765666008,
      "epoch": 0.04631773969430292,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06686665862798691,
      "kl": 0.0010400941246189177,
      "learning_rate": 9.907457156090783e-07,
      "loss": 0.0279,
      "num_tokens": 27650217.0,
      "reward": 0.9611176252365112,
      "reward_std": 0.0151781365275383,
      "rewards/reward_func/mean": 0.9611176252365112,
      "rewards/reward_func/std": 0.01517812255769968,
      "step": 1000,
      "step_time": 18.592824559658766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 128.0,
      "completions/mean_terminated_length": 128.0,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.15011244639754295,
      "epoch": 0.04636405743399722,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007342693861573935,
      "kl": 0.000607321817369666,
      "learning_rate": 9.907364520611392e-07,
      "loss": 0.0,
      "num_tokens": 27679433.0,
      "reward": 0.05351965129375458,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.05351965129375458,
      "rewards/reward_func/std": 0.0,
      "step": 1001,
      "step_time": 15.300740394741297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 143.375,
      "completions/mean_terminated_length": 143.375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.14754782989621162,
      "epoch": 0.04641037517369152,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000899335544090718,
      "kl": 0.0007484348898287863,
      "learning_rate": 9.907271885132004e-07,
      "loss": 0.0,
      "num_tokens": 27699823.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1002,
      "step_time": 15.886766765266657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 189.5,
      "completions/mean_terminated_length": 189.5,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.26406119018793106,
      "epoch": 0.046456692913385826,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013387624640017748,
      "kl": 0.0012153905990999192,
      "learning_rate": 9.907179249652617e-07,
      "loss": 0.0001,
      "num_tokens": 27725191.0,
      "reward": 0.8919567465782166,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8919567465782166,
      "rewards/reward_func/std": 0.0,
      "step": 1003,
      "step_time": 21.744496561586857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 200.4375,
      "completions/mean_terminated_length": 200.4375,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.4516618251800537,
      "epoch": 0.04650301065308013,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018766354769468307,
      "kl": 0.0014909605670254678,
      "learning_rate": 9.907086614173228e-07,
      "loss": 0.0001,
      "num_tokens": 27753150.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1004,
      "step_time": 20.778580099344254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 185.1875,
      "completions/mean_terminated_length": 185.1875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.1956096552312374,
      "epoch": 0.04654932839277443,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005770224379375577,
      "kl": 0.0007550330483354628,
      "learning_rate": 9.90699397869384e-07,
      "loss": 0.0,
      "num_tokens": 27778449.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1005,
      "step_time": 20.553741309791803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 150.5,
      "completions/mean_terminated_length": 150.5,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.2911641448736191,
      "epoch": 0.046595646132468735,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016610927414149046,
      "kl": 0.001227592452778481,
      "learning_rate": 9.90690134321445e-07,
      "loss": 0.0001,
      "num_tokens": 27801849.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1006,
      "step_time": 16.10859252884984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 114.75,
      "completions/mean_terminated_length": 114.75,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3084492161870003,
      "epoch": 0.04664196387216304,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014078120002523065,
      "kl": 0.0016218001546803862,
      "learning_rate": 9.906808707735062e-07,
      "loss": 0.0001,
      "num_tokens": 27821525.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1007,
      "step_time": 13.805916965007782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 132.875,
      "completions/mean_terminated_length": 132.875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.37004653364419937,
      "epoch": 0.04668828161185734,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010392072144895792,
      "kl": 0.0016200426907744259,
      "learning_rate": 9.906716072255673e-07,
      "loss": 0.0001,
      "num_tokens": 27844563.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1008,
      "step_time": 16.00742544233799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 126.5,
      "completions/mean_terminated_length": 126.5,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2602906823158264,
      "epoch": 0.046734599351551644,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007567908032797277,
      "kl": 0.0009213285811711103,
      "learning_rate": 9.906623436776285e-07,
      "loss": 0.0,
      "num_tokens": 27865195.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1009,
      "step_time": 13.58338475972414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 261.5,
      "completions/mean_terminated_length": 261.5,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "entropy": 0.26458117365837097,
      "epoch": 0.04678091709124595,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.054729245603084564,
      "kl": 0.0008141873258864507,
      "learning_rate": 9.906530801296896e-07,
      "loss": -0.0347,
      "num_tokens": 27899251.0,
      "reward": 0.5853970050811768,
      "reward_std": 0.007496384438127279,
      "rewards/reward_func/mean": 0.5853970050811768,
      "rewards/reward_func/std": 0.007496391888707876,
      "step": 1010,
      "step_time": 26.771377734839916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 191.5,
      "completions/mean_terminated_length": 191.5,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.2727834992110729,
      "epoch": 0.04682723483094025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07664250582456589,
      "kl": 0.0009567018860252574,
      "learning_rate": 9.906438165817507e-07,
      "loss": -0.0267,
      "num_tokens": 27931115.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 1011,
      "step_time": 21.531744547188282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 144.5625,
      "completions/mean_terminated_length": 144.5625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.24290068447589874,
      "epoch": 0.04687355257063455,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000837481755297631,
      "kl": 0.0010146870190510526,
      "learning_rate": 9.90634553033812e-07,
      "loss": 0.0001,
      "num_tokens": 27951620.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1012,
      "step_time": 15.088049869984388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 209.75,
      "completions/mean_terminated_length": 209.75,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.3999406322836876,
      "epoch": 0.046919870310328855,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07669055461883545,
      "kl": 0.001167811220511794,
      "learning_rate": 9.906252894858732e-07,
      "loss": -0.2254,
      "num_tokens": 27991568.0,
      "reward": 0.04979052022099495,
      "reward_std": 0.1991620808839798,
      "rewards/reward_func/mean": 0.04979052022099495,
      "rewards/reward_func/std": 0.199162095785141,
      "step": 1013,
      "step_time": 35.830657087266445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 172.4375,
      "completions/mean_terminated_length": 172.4375,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.36374586820602417,
      "epoch": 0.04696618805002316,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00498656602576375,
      "kl": 0.0029439253848977387,
      "learning_rate": 9.90616025937934e-07,
      "loss": 0.0001,
      "num_tokens": 28052807.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1014,
      "step_time": 28.638108514249325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 140.8125,
      "completions/mean_terminated_length": 140.8125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.3514583930373192,
      "epoch": 0.04701250578971746,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012915709521621466,
      "kl": 0.0011993466614512727,
      "learning_rate": 9.906067623899952e-07,
      "loss": 0.0001,
      "num_tokens": 28078228.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1015,
      "step_time": 16.573318760842085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 116.875,
      "completions/mean_terminated_length": 116.875,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "entropy": 0.3068011477589607,
      "epoch": 0.047058823529411764,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025617980863898993,
      "kl": 0.0018665404641069472,
      "learning_rate": 9.905974988420565e-07,
      "loss": 0.0001,
      "num_tokens": 28099010.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1016,
      "step_time": 14.626772541552782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 121.6875,
      "completions/mean_terminated_length": 121.6875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.27892550826072693,
      "epoch": 0.04710514126910607,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015984615311026573,
      "kl": 0.0014860667288303375,
      "learning_rate": 9.905882352941177e-07,
      "loss": 0.0001,
      "num_tokens": 28123037.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1017,
      "step_time": 15.03841832652688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 129.25,
      "completions/mean_terminated_length": 129.25,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2556873820722103,
      "epoch": 0.04715145900880037,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001144444802775979,
      "kl": 0.0011821803927887231,
      "learning_rate": 9.905789717461788e-07,
      "loss": 0.0001,
      "num_tokens": 28142769.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1018,
      "step_time": 13.272651929408312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 179.1875,
      "completions/mean_terminated_length": 179.1875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.32374632358551025,
      "epoch": 0.04719777674849467,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07296735793352127,
      "kl": 0.0012415140372468159,
      "learning_rate": 9.9056970819824e-07,
      "loss": 0.0116,
      "num_tokens": 28171908.0,
      "reward": 0.858630895614624,
      "reward_std": 0.23279428482055664,
      "rewards/reward_func/mean": 0.858630895614624,
      "rewards/reward_func/std": 0.23279428482055664,
      "step": 1019,
      "step_time": 22.157922506332397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 184.0625,
      "completions/mean_terminated_length": 184.0625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.3561431095004082,
      "epoch": 0.047244094488188976,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14152806997299194,
      "kl": 0.0031695798970758915,
      "learning_rate": 9.90560444650301e-07,
      "loss": -0.0217,
      "num_tokens": 28217685.0,
      "reward": 0.3959382176399231,
      "reward_std": 0.46706902980804443,
      "rewards/reward_func/mean": 0.3959382176399231,
      "rewards/reward_func/std": 0.4670690596103668,
      "step": 1020,
      "step_time": 25.170372180640697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 176.0,
      "completions/mean_terminated_length": 176.0,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.37993478775024414,
      "epoch": 0.04729041222788328,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018244581297039986,
      "kl": 0.0018740384257398546,
      "learning_rate": 9.905511811023622e-07,
      "loss": 0.0001,
      "num_tokens": 28241669.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1021,
      "step_time": 20.334804717451334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 131.25,
      "completions/mean_terminated_length": 131.25,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2585686296224594,
      "epoch": 0.04733672996757758,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021120314486324787,
      "kl": 0.0018503092287573963,
      "learning_rate": 9.905419175544233e-07,
      "loss": 0.0001,
      "num_tokens": 28264361.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1022,
      "step_time": 14.707961484789848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 167.6875,
      "completions/mean_terminated_length": 167.6875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.27028487622737885,
      "epoch": 0.047383047707271884,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09869460761547089,
      "kl": 0.0010344291804358363,
      "learning_rate": 9.905326540064844e-07,
      "loss": 0.013,
      "num_tokens": 28286164.0,
      "reward": 0.9014118909835815,
      "reward_std": 0.02629014663398266,
      "rewards/reward_func/mean": 0.9014118909835815,
      "rewards/reward_func/std": 0.026290163397789,
      "step": 1023,
      "step_time": 17.112298902124166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 193.9375,
      "completions/mean_terminated_length": 193.9375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.20764760300517082,
      "epoch": 0.04742936544696619,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011982873547822237,
      "kl": 0.0012532433029264212,
      "learning_rate": 9.905233904585455e-07,
      "loss": 0.0001,
      "num_tokens": 28340467.0,
      "reward": 0.11362193524837494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11362193524837494,
      "rewards/reward_func/std": 0.0,
      "step": 1024,
      "step_time": 27.57717900723219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 133.625,
      "completions/mean_terminated_length": 133.625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.30567528307437897,
      "epoch": 0.04747568318666049,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009536018478684127,
      "kl": 0.0010554982582107186,
      "learning_rate": 9.905141269106067e-07,
      "loss": 0.0001,
      "num_tokens": 28363021.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1025,
      "step_time": 14.877538722008467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 166.1875,
      "completions/mean_terminated_length": 166.1875,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.23596811294555664,
      "epoch": 0.04752200092635479,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008968862821348011,
      "kl": 0.0010560166410868987,
      "learning_rate": 9.905048633626678e-07,
      "loss": 0.0001,
      "num_tokens": 28384080.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 1026,
      "step_time": 17.550704695284367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 194.3125,
      "completions/mean_terminated_length": 194.3125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.18419504538178444,
      "epoch": 0.047568318666049096,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006631119758822024,
      "kl": 0.0007572467438876629,
      "learning_rate": 9.90495599814729e-07,
      "loss": 0.0,
      "num_tokens": 28412421.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1027,
      "step_time": 20.707427095621824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 290.0,
      "completions/max_terminated_length": 290.0,
      "completions/mean_length": 230.375,
      "completions/mean_terminated_length": 230.375,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.3973093628883362,
      "epoch": 0.0476146364057434,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07777100056409836,
      "kl": 0.0015067631611600518,
      "learning_rate": 9.9048633626679e-07,
      "loss": 0.0644,
      "num_tokens": 28438235.0,
      "reward": 0.04411235451698303,
      "reward_std": 0.026303526014089584,
      "rewards/reward_func/mean": 0.04411235451698303,
      "rewards/reward_func/std": 0.026303526014089584,
      "step": 1028,
      "step_time": 25.296633563935757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 129.9375,
      "completions/mean_terminated_length": 129.9375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3399771451950073,
      "epoch": 0.0476609541454377,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014560186536982656,
      "kl": 0.0016204967396333814,
      "learning_rate": 9.904770727188514e-07,
      "loss": 0.0001,
      "num_tokens": 28458394.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1029,
      "step_time": 14.084137599915266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 184.8125,
      "completions/mean_terminated_length": 184.8125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.2464139610528946,
      "epoch": 0.047707271885132005,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008261942421086133,
      "kl": 0.0009879043354885653,
      "learning_rate": 9.904678091709125e-07,
      "loss": 0.0,
      "num_tokens": 28479399.0,
      "reward": 0.6376281380653381,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6376281380653381,
      "rewards/reward_func/std": 0.0,
      "step": 1030,
      "step_time": 18.75266282632947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 176.6875,
      "completions/mean_terminated_length": 176.6875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.3958509489893913,
      "epoch": 0.04775358962482631,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09384248405694962,
      "kl": 0.0012421664432622492,
      "learning_rate": 9.904585456229736e-07,
      "loss": -0.0399,
      "num_tokens": 28505778.0,
      "reward": 0.011110535822808743,
      "reward_std": 0.04444214329123497,
      "rewards/reward_func/mean": 0.011110535822808743,
      "rewards/reward_func/std": 0.04444214701652527,
      "step": 1031,
      "step_time": 21.056372981518507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 125.0625,
      "completions/mean_terminated_length": 125.0625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.29521749913692474,
      "epoch": 0.04779990736452061,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011238530278205872,
      "kl": 0.0012431179638952017,
      "learning_rate": 9.904492820750345e-07,
      "loss": 0.0001,
      "num_tokens": 28528259.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1032,
      "step_time": 14.588222738355398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 125.0,
      "completions/max_terminated_length": 125.0,
      "completions/mean_length": 114.5625,
      "completions/mean_terminated_length": 114.5625,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.28873495757579803,
      "epoch": 0.047846225104214914,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013598831137642264,
      "kl": 0.001216363481944427,
      "learning_rate": 9.904400185270959e-07,
      "loss": 0.0001,
      "num_tokens": 28553868.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1033,
      "step_time": 13.451530616730452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 255.0,
      "completions/max_terminated_length": 255.0,
      "completions/mean_length": 201.0,
      "completions/mean_terminated_length": 201.0,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.30545124411582947,
      "epoch": 0.04789254284390922,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08109713345766068,
      "kl": 0.0015163577045314014,
      "learning_rate": 9.90430754979157e-07,
      "loss": -0.0362,
      "num_tokens": 28577468.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 1034,
      "step_time": 21.60637979581952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 368.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 288.5625,
      "completions/mean_terminated_length": 288.5625,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "entropy": 0.23269639909267426,
      "epoch": 0.04793886058360352,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.076987624168396,
      "kl": 0.0011784299713326618,
      "learning_rate": 9.904214914312181e-07,
      "loss": -0.1046,
      "num_tokens": 28617637.0,
      "reward": 0.6312336921691895,
      "reward_std": 0.4019744098186493,
      "rewards/reward_func/mean": 0.6312336921691895,
      "rewards/reward_func/std": 0.4019744396209717,
      "step": 1035,
      "step_time": 33.45261598005891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 142.1875,
      "completions/mean_terminated_length": 142.1875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.20818743482232094,
      "epoch": 0.04798517832329782,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011204793117940426,
      "kl": 0.0012418660335242748,
      "learning_rate": 9.904122278832793e-07,
      "loss": 0.0001,
      "num_tokens": 28638248.0,
      "reward": 0.780767560005188,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.780767560005188,
      "rewards/reward_func/std": 0.0,
      "step": 1036,
      "step_time": 14.784468349069357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 127.0625,
      "completions/mean_terminated_length": 127.0625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3053184598684311,
      "epoch": 0.048031496062992125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002906875452026725,
      "kl": 0.0019339821301400661,
      "learning_rate": 9.904029643353404e-07,
      "loss": 0.0001,
      "num_tokens": 28668953.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1037,
      "step_time": 16.426546167582273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 130.6875,
      "completions/mean_terminated_length": 130.6875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.34770015627145767,
      "epoch": 0.04807781380268643,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013411898398771882,
      "kl": 0.0017382468504365534,
      "learning_rate": 9.903937007874015e-07,
      "loss": 0.0001,
      "num_tokens": 28689332.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1038,
      "step_time": 14.812857542186975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 113.8125,
      "completions/mean_terminated_length": 113.8125,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.2688029557466507,
      "epoch": 0.04812413154238073,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001557044917717576,
      "kl": 0.001222449413035065,
      "learning_rate": 9.903844372394626e-07,
      "loss": 0.0001,
      "num_tokens": 28712033.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1039,
      "step_time": 14.10106448084116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 167.8125,
      "completions/mean_terminated_length": 167.8125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.4481164366006851,
      "epoch": 0.048170449282075034,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001968338154256344,
      "kl": 0.0017540619592182338,
      "learning_rate": 9.903751736915238e-07,
      "loss": 0.0001,
      "num_tokens": 28734542.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1040,
      "step_time": 17.662689447402954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 141.4375,
      "completions/mean_terminated_length": 141.4375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.2544493079185486,
      "epoch": 0.04821676702176934,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005187811329960823,
      "kl": 0.0017500853282399476,
      "learning_rate": 9.903659101435849e-07,
      "loss": 0.0001,
      "num_tokens": 28758149.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1041,
      "step_time": 15.444901376962662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 211.25,
      "completions/mean_terminated_length": 211.25,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.24108736217021942,
      "epoch": 0.04826308476146364,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018843008438125253,
      "kl": 0.0013402788899838924,
      "learning_rate": 9.90356646595646e-07,
      "loss": 0.0001,
      "num_tokens": 28786537.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1042,
      "step_time": 22.423270910978317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 130.0,
      "completions/max_terminated_length": 130.0,
      "completions/mean_length": 111.375,
      "completions/mean_terminated_length": 111.375,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.3065830022096634,
      "epoch": 0.04830940250115794,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014831394655629992,
      "kl": 0.0015106158389244229,
      "learning_rate": 9.903473830477073e-07,
      "loss": 0.0001,
      "num_tokens": 28806847.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1043,
      "step_time": 13.123977195471525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 129.625,
      "completions/mean_terminated_length": 129.625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.29124607890844345,
      "epoch": 0.048355720240852246,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007524856482632458,
      "kl": 0.0010806052305269986,
      "learning_rate": 9.903381194997685e-07,
      "loss": 0.0001,
      "num_tokens": 28829545.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1044,
      "step_time": 14.699601989239454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 121.5,
      "completions/mean_terminated_length": 121.5,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2185620404779911,
      "epoch": 0.04840203798054655,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001890899264253676,
      "kl": 0.0011860131926368922,
      "learning_rate": 9.903288559518294e-07,
      "loss": 0.0001,
      "num_tokens": 28849185.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1045,
      "step_time": 13.41104994341731
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 307.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 209.6875,
      "completions/mean_terminated_length": 209.6875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3234374299645424,
      "epoch": 0.04844835572024085,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0852591022849083,
      "kl": 0.001257849100511521,
      "learning_rate": 9.903195924038907e-07,
      "loss": -0.1827,
      "num_tokens": 28888236.0,
      "reward": 0.25,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 1046,
      "step_time": 28.974656738340855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 178.75,
      "completions/mean_terminated_length": 178.75,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.377143032848835,
      "epoch": 0.048494673459935154,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013827767688781023,
      "kl": 0.0014107885072007775,
      "learning_rate": 9.903103288559518e-07,
      "loss": 0.0001,
      "num_tokens": 28913528.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1047,
      "step_time": 20.787018537521362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 188.1875,
      "completions/mean_terminated_length": 188.1875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.3820813000202179,
      "epoch": 0.04854099119962946,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09522230923175812,
      "kl": 0.0011572672810871154,
      "learning_rate": 9.90301065308013e-07,
      "loss": 0.0279,
      "num_tokens": 28942011.0,
      "reward": 0.7721847891807556,
      "reward_std": 0.3014300763607025,
      "rewards/reward_func/mean": 0.7721847891807556,
      "rewards/reward_func/std": 0.3014300763607025,
      "step": 1048,
      "step_time": 24.79846828058362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 156.1875,
      "completions/mean_terminated_length": 156.1875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.2822834402322769,
      "epoch": 0.04858730893932376,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010766360210254788,
      "kl": 0.0011587667395360768,
      "learning_rate": 9.90291801760074e-07,
      "loss": 0.0001,
      "num_tokens": 28974830.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1049,
      "step_time": 19.522984847426414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 121.3125,
      "completions/mean_terminated_length": 121.3125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3349555507302284,
      "epoch": 0.04863362667901806,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018369510071352124,
      "kl": 0.0017125543090514839,
      "learning_rate": 9.902825382121352e-07,
      "loss": 0.0001,
      "num_tokens": 28995667.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1050,
      "step_time": 12.765341181308031
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 132.75,
      "completions/mean_terminated_length": 132.75,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2551494538784027,
      "epoch": 0.048679944418712366,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015950956149026752,
      "kl": 0.0009169582335744053,
      "learning_rate": 9.902732746641963e-07,
      "loss": 0.0,
      "num_tokens": 29024271.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1051,
      "step_time": 15.948854491114616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 228.0625,
      "completions/mean_terminated_length": 228.0625,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.40965036302804947,
      "epoch": 0.04872626215840667,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06601638346910477,
      "kl": 0.0010745679610408843,
      "learning_rate": 9.902640111162575e-07,
      "loss": -0.0452,
      "num_tokens": 29062816.0,
      "reward": 0.22259801626205444,
      "reward_std": 0.1388283222913742,
      "rewards/reward_func/mean": 0.22259801626205444,
      "rewards/reward_func/std": 0.1388283371925354,
      "step": 1052,
      "step_time": 26.456897154450417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 160.0,
      "completions/mean_terminated_length": 160.0,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.17067742720246315,
      "epoch": 0.04877257989810097,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08437284082174301,
      "kl": 0.001013905493891798,
      "learning_rate": 9.902547475683186e-07,
      "loss": -0.0323,
      "num_tokens": 29083904.0,
      "reward": 0.1774260252714157,
      "reward_std": 0.00568058155477047,
      "rewards/reward_func/mean": 0.1774260252714157,
      "rewards/reward_func/std": 0.005680582020431757,
      "step": 1053,
      "step_time": 16.477448847144842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 146.125,
      "completions/mean_terminated_length": 146.125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.3493001386523247,
      "epoch": 0.048818897637795275,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017707743681967258,
      "kl": 0.002003938687266782,
      "learning_rate": 9.902454840203797e-07,
      "loss": 0.0001,
      "num_tokens": 29106786.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1054,
      "step_time": 16.477712485939264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 413.0,
      "completions/max_terminated_length": 413.0,
      "completions/mean_length": 371.75,
      "completions/mean_terminated_length": 371.75,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "entropy": 0.16637060418725014,
      "epoch": 0.04886521537748958,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.03795616701245308,
      "kl": 0.0005525052256416529,
      "learning_rate": 9.902362204724408e-07,
      "loss": -0.0019,
      "num_tokens": 29135694.0,
      "reward": 0.9831075072288513,
      "reward_std": 0.017591135576367378,
      "rewards/reward_func/mean": 0.9831075072288513,
      "rewards/reward_func/std": 0.017591137439012527,
      "step": 1055,
      "step_time": 33.35343899577856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 124.5,
      "completions/mean_terminated_length": 124.5,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.31635092198848724,
      "epoch": 0.04891153311718388,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012206325773149729,
      "kl": 0.001716562604997307,
      "learning_rate": 9.902269569245022e-07,
      "loss": 0.0001,
      "num_tokens": 29163638.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1056,
      "step_time": 15.829910147935152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 147.3125,
      "completions/mean_terminated_length": 147.3125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.3439546301960945,
      "epoch": 0.048957850856878184,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14148610830307007,
      "kl": 0.0021906031761318445,
      "learning_rate": 9.90217693376563e-07,
      "loss": 0.0021,
      "num_tokens": 29188555.0,
      "reward": 0.04598493129014969,
      "reward_std": 0.12565475702285767,
      "rewards/reward_func/mean": 0.04598493129014969,
      "rewards/reward_func/std": 0.12565475702285767,
      "step": 1057,
      "step_time": 16.40659347549081
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 169.625,
      "completions/mean_terminated_length": 169.625,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3805572837591171,
      "epoch": 0.04900416859657249,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009291348978877068,
      "kl": 0.0014806788240093738,
      "learning_rate": 9.902084298286242e-07,
      "loss": 0.0001,
      "num_tokens": 29211077.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1058,
      "step_time": 17.243373408913612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 167.9375,
      "completions/mean_terminated_length": 167.9375,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.3104632496833801,
      "epoch": 0.04905048633626679,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008697710582055151,
      "kl": 0.0010451382404426113,
      "learning_rate": 9.901991662806855e-07,
      "loss": 0.0001,
      "num_tokens": 29233732.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1059,
      "step_time": 17.982558369636536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 135.0,
      "completions/mean_terminated_length": 135.0,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.341328501701355,
      "epoch": 0.04909680407596109,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008639628649689257,
      "kl": 0.001301385651458986,
      "learning_rate": 9.901899027327467e-07,
      "loss": 0.0001,
      "num_tokens": 29269716.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1060,
      "step_time": 18.138447511941195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 135.25,
      "completions/mean_terminated_length": 135.25,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2975976839661598,
      "epoch": 0.049143121815655395,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008323938236571848,
      "kl": 0.001154248500824906,
      "learning_rate": 9.901806391848078e-07,
      "loss": 0.0001,
      "num_tokens": 29293592.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1061,
      "step_time": 16.84081495180726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 191.0625,
      "completions/mean_terminated_length": 191.0625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.21305131912231445,
      "epoch": 0.0491894395553497,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07748810946941376,
      "kl": 0.0010197410738328472,
      "learning_rate": 9.90171375636869e-07,
      "loss": 0.0379,
      "num_tokens": 29318441.0,
      "reward": 0.9566360712051392,
      "reward_std": 0.039497580379247665,
      "rewards/reward_func/mean": 0.9566360712051392,
      "rewards/reward_func/std": 0.03949758782982826,
      "step": 1062,
      "step_time": 19.56187452748418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 188.125,
      "completions/mean_terminated_length": 188.125,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.1875189207494259,
      "epoch": 0.049235757295044,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009874739916995168,
      "kl": 0.0007780540472595021,
      "learning_rate": 9.9016211208893e-07,
      "loss": 0.0,
      "num_tokens": 29341595.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1063,
      "step_time": 19.514124918729067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 110.4375,
      "completions/mean_terminated_length": 110.4375,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.28170251101255417,
      "epoch": 0.049282075034738304,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002179292729124427,
      "kl": 0.0021450460189953446,
      "learning_rate": 9.901528485409912e-07,
      "loss": 0.0001,
      "num_tokens": 29361986.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1064,
      "step_time": 13.229779623448849
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 193.75,
      "completions/mean_terminated_length": 193.75,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.25674911588430405,
      "epoch": 0.04932839277443261,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008665647474117577,
      "kl": 0.0011798219056800008,
      "learning_rate": 9.901435849930523e-07,
      "loss": 0.0001,
      "num_tokens": 29394046.0,
      "reward": 0.5623413324356079,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5623413324356079,
      "rewards/reward_func/std": 0.0,
      "step": 1065,
      "step_time": 21.710856899619102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 150.6875,
      "completions/mean_terminated_length": 150.6875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.2926267385482788,
      "epoch": 0.04937471051412691,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008478787494823337,
      "kl": 0.001008578998153098,
      "learning_rate": 9.901343214451134e-07,
      "loss": 0.0001,
      "num_tokens": 29414441.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1066,
      "step_time": 16.394194394350052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 149.5625,
      "completions/mean_terminated_length": 149.5625,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.3093181997537613,
      "epoch": 0.04942102825382121,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009809837210923433,
      "kl": 0.0011437874200055376,
      "learning_rate": 9.901250578971745e-07,
      "loss": 0.0001,
      "num_tokens": 29449090.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1067,
      "step_time": 20.499243050813675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 132.4375,
      "completions/mean_terminated_length": 132.4375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.2840966731309891,
      "epoch": 0.049467345993515516,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014046216383576393,
      "kl": 0.00144971240661107,
      "learning_rate": 9.901157943492357e-07,
      "loss": 0.0001,
      "num_tokens": 29481625.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1068,
      "step_time": 16.62565889954567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 191.625,
      "completions/mean_terminated_length": 191.625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.41206422448158264,
      "epoch": 0.04951366373320982,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012185772648081183,
      "kl": 0.0015064417966641486,
      "learning_rate": 9.901065308012968e-07,
      "loss": 0.0001,
      "num_tokens": 29510867.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1069,
      "step_time": 21.32372997328639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 132.5,
      "completions/mean_terminated_length": 132.5,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2584761790931225,
      "epoch": 0.04955998147290412,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008137408876791596,
      "kl": 0.0009865954925771803,
      "learning_rate": 9.90097267253358e-07,
      "loss": 0.0,
      "num_tokens": 29533003.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1070,
      "step_time": 14.583113599568605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 143.375,
      "completions/mean_terminated_length": 143.375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3181304261088371,
      "epoch": 0.049606299212598425,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000891090021468699,
      "kl": 0.00127486334531568,
      "learning_rate": 9.90088003705419e-07,
      "loss": 0.0001,
      "num_tokens": 29555345.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1071,
      "step_time": 15.791466876864433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 184.0625,
      "completions/mean_terminated_length": 184.0625,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.2723410539329052,
      "epoch": 0.04965261695229273,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08994110673666,
      "kl": 0.0010682634310796857,
      "learning_rate": 9.900787401574802e-07,
      "loss": -0.092,
      "num_tokens": 29581058.0,
      "reward": 0.8752042055130005,
      "reward_std": 0.33016297221183777,
      "rewards/reward_func/mean": 0.8752042055130005,
      "rewards/reward_func/std": 0.33016300201416016,
      "step": 1072,
      "step_time": 21.679789248853922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 141.3125,
      "completions/mean_terminated_length": 141.3125,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3139200508594513,
      "epoch": 0.04969893469198703,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014281471958383918,
      "kl": 0.0011057298397645354,
      "learning_rate": 9.900694766095415e-07,
      "loss": 0.0001,
      "num_tokens": 29608423.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1073,
      "step_time": 16.696265920996666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 356.0,
      "completions/max_terminated_length": 356.0,
      "completions/mean_length": 323.75,
      "completions/mean_terminated_length": 323.75,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "entropy": 0.2060050554573536,
      "epoch": 0.04974525243168133,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.058326784521341324,
      "kl": 0.0007712448859820142,
      "learning_rate": 9.900602130616026e-07,
      "loss": -0.032,
      "num_tokens": 29636355.0,
      "reward": 0.8223556280136108,
      "reward_std": 0.014419873245060444,
      "rewards/reward_func/mean": 0.8223556280136108,
      "rewards/reward_func/std": 0.014419869519770145,
      "step": 1074,
      "step_time": 29.50220875069499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 169.875,
      "completions/mean_terminated_length": 169.875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.3809701278805733,
      "epoch": 0.049791570171375636,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016130228759720922,
      "kl": 0.001457754144212231,
      "learning_rate": 9.900509495136635e-07,
      "loss": 0.0001,
      "num_tokens": 29658593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1075,
      "step_time": 19.350709948688745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 149.75,
      "completions/mean_terminated_length": 149.75,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.33714165538549423,
      "epoch": 0.04983788791106994,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016034268774092197,
      "kl": 0.0013555600889958441,
      "learning_rate": 9.900416859657249e-07,
      "loss": 0.0001,
      "num_tokens": 29683661.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1076,
      "step_time": 16.41102172806859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 156.375,
      "completions/mean_terminated_length": 156.375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.31672170758247375,
      "epoch": 0.04988420565076424,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008252878906205297,
      "kl": 0.0012895289110019803,
      "learning_rate": 9.90032422417786e-07,
      "loss": 0.0001,
      "num_tokens": 29705955.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1077,
      "step_time": 16.75003569200635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 311.0,
      "completions/max_terminated_length": 311.0,
      "completions/mean_length": 247.3125,
      "completions/mean_terminated_length": 247.3125,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "entropy": 0.2997872605919838,
      "epoch": 0.049930523390458545,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07149261981248856,
      "kl": 0.0010818528244271874,
      "learning_rate": 9.900231588698471e-07,
      "loss": 0.024,
      "num_tokens": 29744312.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 1078,
      "step_time": 28.95836164802313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 133.5625,
      "completions/mean_terminated_length": 133.5625,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.2362428493797779,
      "epoch": 0.04997684113015285,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010859938338398933,
      "kl": 0.0012356432271189988,
      "learning_rate": 9.900138953219083e-07,
      "loss": 0.0001,
      "num_tokens": 29763953.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1079,
      "step_time": 13.911317389458418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 172.375,
      "completions/mean_terminated_length": 172.375,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3578645661473274,
      "epoch": 0.05002315886984715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09514566510915756,
      "kl": 0.0016648909368086606,
      "learning_rate": 9.900046317739694e-07,
      "loss": 0.1251,
      "num_tokens": 29784487.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 1080,
      "step_time": 19.114783979952335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 161.8125,
      "completions/mean_terminated_length": 161.8125,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3330864980816841,
      "epoch": 0.050069476609541454,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09150286763906479,
      "kl": 0.0010565592092461884,
      "learning_rate": 9.899953682260305e-07,
      "loss": -0.0914,
      "num_tokens": 29804804.0,
      "reward": 0.0625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.0625,
      "rewards/reward_func/std": 0.25,
      "step": 1081,
      "step_time": 20.501433834433556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 298.0,
      "completions/max_terminated_length": 298.0,
      "completions/mean_length": 249.6875,
      "completions/mean_terminated_length": 249.6875,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.37190791219472885,
      "epoch": 0.05011579434923576,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07680151611566544,
      "kl": 0.0011202323366887867,
      "learning_rate": 9.899861046780916e-07,
      "loss": -0.1132,
      "num_tokens": 29828783.0,
      "reward": 0.5919939875602722,
      "reward_std": 0.4850125014781952,
      "rewards/reward_func/mean": 0.5919939875602722,
      "rewards/reward_func/std": 0.4850125312805176,
      "step": 1082,
      "step_time": 24.543287433683872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 200.9375,
      "completions/mean_terminated_length": 200.9375,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.19050583988428116,
      "epoch": 0.05016211208893006,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0795620009303093,
      "kl": 0.0007946142286527902,
      "learning_rate": 9.899768411301528e-07,
      "loss": 0.058,
      "num_tokens": 29852190.0,
      "reward": 0.70708829164505,
      "reward_std": 0.010605335235595703,
      "rewards/reward_func/mean": 0.70708829164505,
      "rewards/reward_func/std": 0.010605335235595703,
      "step": 1083,
      "step_time": 22.166945844888687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 155.4375,
      "completions/mean_terminated_length": 155.4375,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.19904791936278343,
      "epoch": 0.05020842982862436,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10177665203809738,
      "kl": 0.0013613034825539216,
      "learning_rate": 9.899675775822139e-07,
      "loss": 0.0314,
      "num_tokens": 29874261.0,
      "reward": 0.8440724611282349,
      "reward_std": 0.05945207178592682,
      "rewards/reward_func/mean": 0.8440724611282349,
      "rewards/reward_func/std": 0.05945207178592682,
      "step": 1084,
      "step_time": 16.072620674967766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 301.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 243.1875,
      "completions/mean_terminated_length": 243.1875,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "entropy": 0.2188308946788311,
      "epoch": 0.050254747568318665,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08357269316911697,
      "kl": 0.0009986345248762518,
      "learning_rate": 9.89958314034275e-07,
      "loss": -0.0593,
      "num_tokens": 29898312.0,
      "reward": 0.5324225425720215,
      "reward_std": 0.12115535885095596,
      "rewards/reward_func/mean": 0.5324225425720215,
      "rewards/reward_func/std": 0.12115536630153656,
      "step": 1085,
      "step_time": 25.0636737793684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 171.8125,
      "completions/mean_terminated_length": 171.8125,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.21330950036644936,
      "epoch": 0.05030106530801297,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08567479997873306,
      "kl": 0.0011195143160875887,
      "learning_rate": 9.899490504863363e-07,
      "loss": 0.0049,
      "num_tokens": 29926453.0,
      "reward": 0.9535844326019287,
      "reward_std": 0.023028584197163582,
      "rewards/reward_func/mean": 0.9535844326019287,
      "rewards/reward_func/std": 0.023028582334518433,
      "step": 1086,
      "step_time": 19.37694550305605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 124.875,
      "completions/mean_terminated_length": 124.875,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.33746711909770966,
      "epoch": 0.05034738304770727,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013701335992664099,
      "kl": 0.001653151965001598,
      "learning_rate": 9.899397869383975e-07,
      "loss": 0.0001,
      "num_tokens": 29946179.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1087,
      "step_time": 14.556724030524492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 188.125,
      "completions/mean_terminated_length": 188.125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.2678183689713478,
      "epoch": 0.050393700787401574,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07312487810850143,
      "kl": 0.0011790625285357237,
      "learning_rate": 9.899305233904584e-07,
      "loss": -0.0268,
      "num_tokens": 29968917.0,
      "reward": 0.9347172975540161,
      "reward_std": 0.025483757257461548,
      "rewards/reward_func/mean": 0.9347172975540161,
      "rewards/reward_func/std": 0.025483759120106697,
      "step": 1088,
      "step_time": 18.529357075691223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 189.9375,
      "completions/mean_terminated_length": 189.9375,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.2221939153969288,
      "epoch": 0.05044001852709588,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09114961326122284,
      "kl": 0.0011734712170436978,
      "learning_rate": 9.899212598425197e-07,
      "loss": -0.0636,
      "num_tokens": 29991316.0,
      "reward": 0.6862379312515259,
      "reward_std": 0.4361012578010559,
      "rewards/reward_func/mean": 0.6862379312515259,
      "rewards/reward_func/std": 0.4361012578010559,
      "step": 1089,
      "step_time": 21.20943710207939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 163.4375,
      "completions/mean_terminated_length": 163.4375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.38223615288734436,
      "epoch": 0.05048633626679018,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023525089491158724,
      "kl": 0.002194980625063181,
      "learning_rate": 9.899119962945808e-07,
      "loss": 0.0001,
      "num_tokens": 30044427.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1090,
      "step_time": 23.972115632146597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 146.375,
      "completions/mean_terminated_length": 146.375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.2931435704231262,
      "epoch": 0.05053265400648448,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00117598962970078,
      "kl": 0.0012452219962142408,
      "learning_rate": 9.89902732746642e-07,
      "loss": 0.0001,
      "num_tokens": 30068017.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1091,
      "step_time": 16.37996008247137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 123.8125,
      "completions/mean_terminated_length": 123.8125,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.27399730682373047,
      "epoch": 0.050578971746178786,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009294325718656182,
      "kl": 0.0011050984903704375,
      "learning_rate": 9.89893469198703e-07,
      "loss": 0.0001,
      "num_tokens": 30089918.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1092,
      "step_time": 13.417639058083296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 200.3125,
      "completions/mean_terminated_length": 200.3125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.2586069777607918,
      "epoch": 0.05062528948587309,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011547214817255735,
      "kl": 0.0011535890225786716,
      "learning_rate": 9.898842056507642e-07,
      "loss": 0.0001,
      "num_tokens": 30116259.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1093,
      "step_time": 21.74313473328948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 172.25,
      "completions/mean_terminated_length": 172.25,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.38282404094934464,
      "epoch": 0.05067160722556739,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014855681220069528,
      "kl": 0.0015927240310702473,
      "learning_rate": 9.898749421028253e-07,
      "loss": 0.0001,
      "num_tokens": 30141159.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1094,
      "step_time": 18.784075524657965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 124.9375,
      "completions/mean_terminated_length": 124.9375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2513014040887356,
      "epoch": 0.050717924965261695,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014614450046792626,
      "kl": 0.0014210605586413294,
      "learning_rate": 9.898656785548865e-07,
      "loss": 0.0001,
      "num_tokens": 30160550.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1095,
      "step_time": 14.311907079070807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 290.0,
      "completions/max_terminated_length": 290.0,
      "completions/mean_length": 247.6875,
      "completions/mean_terminated_length": 247.6875,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "entropy": 0.31304802745580673,
      "epoch": 0.050764242704956,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07821127772331238,
      "kl": 0.0014542262069880962,
      "learning_rate": 9.898564150069476e-07,
      "loss": -0.0658,
      "num_tokens": 30198913.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 1096,
      "step_time": 27.757250882685184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 173.625,
      "completions/mean_terminated_length": 173.625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.33365585654973984,
      "epoch": 0.0508105604446503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007472095894627273,
      "kl": 0.0012242514931131154,
      "learning_rate": 9.898471514590087e-07,
      "loss": 0.0001,
      "num_tokens": 30230907.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1097,
      "step_time": 20.173478361219168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 161.9375,
      "completions/mean_terminated_length": 161.9375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.40793028473854065,
      "epoch": 0.0508568781843446,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021767800208181143,
      "kl": 0.001700811495538801,
      "learning_rate": 9.898378879110698e-07,
      "loss": 0.0001,
      "num_tokens": 30258490.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1098,
      "step_time": 20.168668530881405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 178.875,
      "completions/mean_terminated_length": 178.875,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.25799454003572464,
      "epoch": 0.050903195924038906,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09027945250272751,
      "kl": 0.0012264355609659106,
      "learning_rate": 9.898286243631312e-07,
      "loss": -0.0359,
      "num_tokens": 30283272.0,
      "reward": 0.8811430931091309,
      "reward_std": 0.05903739482164383,
      "rewards/reward_func/mean": 0.8811430931091309,
      "rewards/reward_func/std": 0.05903739109635353,
      "step": 1099,
      "step_time": 18.734569620341063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 132.9375,
      "completions/mean_terminated_length": 132.9375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3116462081670761,
      "epoch": 0.05094951366373321,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014807460829615593,
      "kl": 0.001511095353635028,
      "learning_rate": 9.89819360815192e-07,
      "loss": 0.0001,
      "num_tokens": 30306567.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1100,
      "step_time": 14.597326949238777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 182.3125,
      "completions/mean_terminated_length": 182.3125,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.2798741087317467,
      "epoch": 0.05099583140342751,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07083644717931747,
      "kl": 0.0011947660677833483,
      "learning_rate": 9.898100972672532e-07,
      "loss": -0.0113,
      "num_tokens": 30328124.0,
      "reward": 0.8233011960983276,
      "reward_std": 0.21954698860645294,
      "rewards/reward_func/mean": 0.8233011960983276,
      "rewards/reward_func/std": 0.21954698860645294,
      "step": 1101,
      "step_time": 20.294959507882595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 208.1875,
      "completions/mean_terminated_length": 208.1875,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.2013857662677765,
      "epoch": 0.051042149143121815,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0598393976688385,
      "kl": 0.0010075128375319764,
      "learning_rate": 9.898008337193143e-07,
      "loss": 0.0009,
      "num_tokens": 30354879.0,
      "reward": 0.8500427007675171,
      "reward_std": 0.05378911271691322,
      "rewards/reward_func/mean": 0.8500427007675171,
      "rewards/reward_func/std": 0.053789105266332626,
      "step": 1102,
      "step_time": 21.27879510819912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 320.0,
      "completions/max_terminated_length": 320.0,
      "completions/mean_length": 245.625,
      "completions/mean_terminated_length": 245.625,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.4276796504855156,
      "epoch": 0.05108846688281612,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07888320088386536,
      "kl": 0.0011481109831947833,
      "learning_rate": 9.897915701713757e-07,
      "loss": 0.089,
      "num_tokens": 30379417.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 1103,
      "step_time": 28.08647546172142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 203.625,
      "completions/mean_terminated_length": 203.625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.3153912052512169,
      "epoch": 0.05113478462251042,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1048893928527832,
      "kl": 0.0014552755455952138,
      "learning_rate": 9.897823066234368e-07,
      "loss": 0.0111,
      "num_tokens": 30408147.0,
      "reward": 0.3920607566833496,
      "reward_std": 0.29425516724586487,
      "rewards/reward_func/mean": 0.3920607566833496,
      "rewards/reward_func/std": 0.29425519704818726,
      "step": 1104,
      "step_time": 24.419979251921177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 229.375,
      "completions/mean_terminated_length": 229.375,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "entropy": 0.3451824113726616,
      "epoch": 0.051181102362204724,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0886145755648613,
      "kl": 0.001503424602560699,
      "learning_rate": 9.89773043075498e-07,
      "loss": 0.0201,
      "num_tokens": 30446601.0,
      "reward": 0.9942312240600586,
      "reward_std": 0.012402480468153954,
      "rewards/reward_func/mean": 0.9942312240600586,
      "rewards/reward_func/std": 0.01240248791873455,
      "step": 1105,
      "step_time": 25.77444277703762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.24910329654812813,
      "epoch": 0.05122742010189903,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001084607094526291,
      "kl": 0.0010699391859816387,
      "learning_rate": 9.89763779527559e-07,
      "loss": 0.0001,
      "num_tokens": 30468519.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1106,
      "step_time": 18.155639626085758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 120.6875,
      "completions/mean_terminated_length": 120.6875,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.25526338815689087,
      "epoch": 0.05127373784159333,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016850433312356472,
      "kl": 0.001595150592038408,
      "learning_rate": 9.897545159796202e-07,
      "loss": 0.0001,
      "num_tokens": 30488162.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1107,
      "step_time": 12.975851442664862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 192.8125,
      "completions/mean_terminated_length": 192.8125,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.3439238592982292,
      "epoch": 0.05132005558128763,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012190868146717548,
      "kl": 0.0012358661915641278,
      "learning_rate": 9.897452524316813e-07,
      "loss": 0.0001,
      "num_tokens": 30520335.0,
      "reward": 0.8751733303070068,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8751733303070068,
      "rewards/reward_func/std": 0.0,
      "step": 1108,
      "step_time": 23.448437191545963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 173.75,
      "completions/mean_terminated_length": 173.75,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.22079867124557495,
      "epoch": 0.051366373320981935,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06497100740671158,
      "kl": 0.0009250911243725568,
      "learning_rate": 9.897359888837424e-07,
      "loss": -0.0125,
      "num_tokens": 30541467.0,
      "reward": 0.8230994939804077,
      "reward_std": 0.09352880716323853,
      "rewards/reward_func/mean": 0.8230994939804077,
      "rewards/reward_func/std": 0.09352879971265793,
      "step": 1109,
      "step_time": 18.13630971312523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 119.125,
      "completions/mean_terminated_length": 119.125,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2911180630326271,
      "epoch": 0.05141269106067624,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010315573308616877,
      "kl": 0.0011704214848577976,
      "learning_rate": 9.897267253358036e-07,
      "loss": 0.0001,
      "num_tokens": 30561229.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1110,
      "step_time": 12.498386699706316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 210.875,
      "completions/mean_terminated_length": 210.875,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.19843004271388054,
      "epoch": 0.05145900880037054,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004728315689135343,
      "kl": 0.0006877919950056821,
      "learning_rate": 9.897174617878647e-07,
      "loss": 0.0,
      "num_tokens": 30593211.0,
      "reward": 0.9574533700942993,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9574533700942993,
      "rewards/reward_func/std": 0.0,
      "step": 1111,
      "step_time": 23.56514134258032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 159.4375,
      "completions/mean_terminated_length": 159.4375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.38042762130498886,
      "epoch": 0.051505326540064844,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010632815537974238,
      "kl": 0.0014865275588817894,
      "learning_rate": 9.897081982399258e-07,
      "loss": 0.0001,
      "num_tokens": 30627218.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1112,
      "step_time": 19.564482927322388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 145.1875,
      "completions/mean_terminated_length": 145.1875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.1334095038473606,
      "epoch": 0.05155164427975915,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005674534477293491,
      "kl": 0.0005581184232141823,
      "learning_rate": 9.89698934691987e-07,
      "loss": 0.0,
      "num_tokens": 30657221.0,
      "reward": 0.8574039340019226,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8574039340019226,
      "rewards/reward_func/std": 0.0,
      "step": 1113,
      "step_time": 17.10247927904129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 179.0,
      "completions/mean_terminated_length": 179.0,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.35668937116861343,
      "epoch": 0.05159796201945345,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13557200133800507,
      "kl": 0.0018954368715640157,
      "learning_rate": 9.89689671144048e-07,
      "loss": 0.0713,
      "num_tokens": 30685237.0,
      "reward": 0.862541675567627,
      "reward_std": 0.23001109063625336,
      "rewards/reward_func/mean": 0.862541675567627,
      "rewards/reward_func/std": 0.23001112043857574,
      "step": 1114,
      "step_time": 20.952950689941645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 167.625,
      "completions/mean_terminated_length": 167.625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.19039323180913925,
      "epoch": 0.05164427975914775,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006320044631138444,
      "kl": 0.0007345816120505333,
      "learning_rate": 9.896804075961092e-07,
      "loss": 0.0,
      "num_tokens": 30706863.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 1115,
      "step_time": 17.534554477781057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 126.9375,
      "completions/mean_terminated_length": 126.9375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3573169931769371,
      "epoch": 0.051690597498842056,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010215246584266424,
      "kl": 0.001329431717749685,
      "learning_rate": 9.896711440481705e-07,
      "loss": 0.0001,
      "num_tokens": 30726974.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1116,
      "step_time": 14.324847247451544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 153.875,
      "completions/mean_terminated_length": 153.875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.4384104534983635,
      "epoch": 0.05173691523853636,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011730300029739738,
      "kl": 0.001636853179661557,
      "learning_rate": 9.896618805002316e-07,
      "loss": 0.0001,
      "num_tokens": 30757676.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1117,
      "step_time": 18.594801753759384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 165.875,
      "completions/mean_terminated_length": 165.875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.17398259416222572,
      "epoch": 0.05178323297823066,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00389515096321702,
      "kl": 0.0020509135792963207,
      "learning_rate": 9.896526169522926e-07,
      "loss": 0.0001,
      "num_tokens": 30803322.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 1118,
      "step_time": 23.118459444493055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 125.75,
      "completions/mean_terminated_length": 125.75,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2766589969396591,
      "epoch": 0.051829550717924965,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018308153375983238,
      "kl": 0.0017926291620824486,
      "learning_rate": 9.896433534043539e-07,
      "loss": 0.0001,
      "num_tokens": 30832822.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1119,
      "step_time": 14.803225938230753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 125.0,
      "completions/mean_terminated_length": 125.0,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.28757575154304504,
      "epoch": 0.05187586845761927,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011737167369574308,
      "kl": 0.0012856294924858958,
      "learning_rate": 9.89634089856415e-07,
      "loss": 0.0001,
      "num_tokens": 30853286.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1120,
      "step_time": 13.634919803589582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 220.625,
      "completions/mean_terminated_length": 220.625,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "entropy": 0.23788436874747276,
      "epoch": 0.05192218619731357,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07899540662765503,
      "kl": 0.0010609175369609147,
      "learning_rate": 9.896248263084761e-07,
      "loss": -0.0196,
      "num_tokens": 30887120.0,
      "reward": 0.7089771628379822,
      "reward_std": 0.30815425515174866,
      "rewards/reward_func/mean": 0.7089771628379822,
      "rewards/reward_func/std": 0.30815425515174866,
      "step": 1121,
      "step_time": 23.72310246527195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 133.875,
      "completions/mean_terminated_length": 133.875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.23498577252030373,
      "epoch": 0.05196850393700787,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009531736723147333,
      "kl": 0.0010055435268441215,
      "learning_rate": 9.896155627605373e-07,
      "loss": 0.0001,
      "num_tokens": 30906974.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1122,
      "step_time": 14.71127225831151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 300.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 216.125,
      "completions/mean_terminated_length": 216.125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.31246964633464813,
      "epoch": 0.052014821676702176,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07301823794841766,
      "kl": 0.0012709646834991872,
      "learning_rate": 9.896062992125984e-07,
      "loss": -0.0116,
      "num_tokens": 30942368.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 1123,
      "step_time": 27.333131555467844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 289.0,
      "completions/max_terminated_length": 289.0,
      "completions/mean_length": 233.0,
      "completions/mean_terminated_length": 233.0,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.29382146522402763,
      "epoch": 0.05206113941639648,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07238540798425674,
      "kl": 0.0013140554656274617,
      "learning_rate": 9.895970356646595e-07,
      "loss": -0.0442,
      "num_tokens": 30973616.0,
      "reward": 0.18292942643165588,
      "reward_std": 0.07961179316043854,
      "rewards/reward_func/mean": 0.18292942643165588,
      "rewards/reward_func/std": 0.07961180061101913,
      "step": 1124,
      "step_time": 25.977344941347837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 131.3125,
      "completions/mean_terminated_length": 131.3125,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2118467167019844,
      "epoch": 0.05210745715609078,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007855461444705725,
      "kl": 0.0009936098067555577,
      "learning_rate": 9.895877721167206e-07,
      "loss": 0.0,
      "num_tokens": 30993173.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1125,
      "step_time": 13.984401155263186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 188.5,
      "completions/mean_terminated_length": 188.5,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.43601953983306885,
      "epoch": 0.052153774895785085,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009769224561750889,
      "kl": 0.001453546603443101,
      "learning_rate": 9.895785085687818e-07,
      "loss": 0.0001,
      "num_tokens": 31016733.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1126,
      "step_time": 19.12609263509512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 138.375,
      "completions/mean_terminated_length": 138.375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.31714512407779694,
      "epoch": 0.05220009263547939,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007875919691286981,
      "kl": 0.0011351430439390242,
      "learning_rate": 9.895692450208429e-07,
      "loss": 0.0001,
      "num_tokens": 31039059.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1127,
      "step_time": 14.93588675931096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 134.875,
      "completions/mean_terminated_length": 134.875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.2508166506886482,
      "epoch": 0.05224641037517369,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006805529352277517,
      "kl": 0.0027232636348344386,
      "learning_rate": 9.89559981472904e-07,
      "loss": 0.0001,
      "num_tokens": 31058641.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1128,
      "step_time": 14.223127357661724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 190.5625,
      "completions/mean_terminated_length": 190.5625,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.3678119406104088,
      "epoch": 0.052292728114867994,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17108699679374695,
      "kl": 0.007267465902259573,
      "learning_rate": 9.895507179249653e-07,
      "loss": 0.0606,
      "num_tokens": 31082202.0,
      "reward": 0.40776169300079346,
      "reward_std": 0.4784238338470459,
      "rewards/reward_func/mean": 0.40776169300079346,
      "rewards/reward_func/std": 0.4784238338470459,
      "step": 1129,
      "step_time": 21.02282042056322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 128.1875,
      "completions/mean_terminated_length": 128.1875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2689135894179344,
      "epoch": 0.0523390458545623,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010215247748419642,
      "kl": 0.0011080264812335372,
      "learning_rate": 9.895414543770265e-07,
      "loss": 0.0001,
      "num_tokens": 31102413.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1130,
      "step_time": 13.169312495738268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 198.9375,
      "completions/mean_terminated_length": 198.9375,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.37807053327560425,
      "epoch": 0.0523853635942566,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14330777525901794,
      "kl": 0.0016479375190101564,
      "learning_rate": 9.895321908290874e-07,
      "loss": 0.0268,
      "num_tokens": 31131404.0,
      "reward": 0.28276169300079346,
      "reward_std": 0.43315792083740234,
      "rewards/reward_func/mean": 0.28276169300079346,
      "rewards/reward_func/std": 0.43315792083740234,
      "step": 1131,
      "step_time": 22.18549221381545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 195.5,
      "completions/mean_terminated_length": 195.5,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.2906291112303734,
      "epoch": 0.0524316813339509,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09652494639158249,
      "kl": 0.0015081569727044553,
      "learning_rate": 9.895229272811485e-07,
      "loss": -0.0877,
      "num_tokens": 31153652.0,
      "reward": 0.5340820550918579,
      "reward_std": 0.3908340036869049,
      "rewards/reward_func/mean": 0.5340820550918579,
      "rewards/reward_func/std": 0.3908340334892273,
      "step": 1132,
      "step_time": 20.599905110895634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 275.0,
      "completions/max_terminated_length": 275.0,
      "completions/mean_length": 199.1875,
      "completions/mean_terminated_length": 199.1875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.3890107646584511,
      "epoch": 0.052477999073645205,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014516408555209637,
      "kl": 0.0018202054779976606,
      "learning_rate": 9.895136637332098e-07,
      "loss": 0.0001,
      "num_tokens": 31183367.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1133,
      "step_time": 24.62832786515355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 170.5625,
      "completions/mean_terminated_length": 170.5625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.199891347438097,
      "epoch": 0.05252431681333951,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007594860508106649,
      "kl": 0.0009866192995104939,
      "learning_rate": 9.89504400185271e-07,
      "loss": 0.0,
      "num_tokens": 31220416.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1134,
      "step_time": 20.979295525699854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 360.0,
      "completions/max_terminated_length": 360.0,
      "completions/mean_length": 223.875,
      "completions/mean_terminated_length": 223.875,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.15802664309740067,
      "epoch": 0.05257063455303381,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05633833259344101,
      "kl": 0.0006402080762200058,
      "learning_rate": 9.89495136637332e-07,
      "loss": 0.3036,
      "num_tokens": 31245774.0,
      "reward": 0.8136550188064575,
      "reward_std": 0.333344042301178,
      "rewards/reward_func/mean": 0.8136550188064575,
      "rewards/reward_func/std": 0.333344042301178,
      "step": 1135,
      "step_time": 29.257002096623182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 161.6875,
      "completions/mean_terminated_length": 161.6875,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.202426265925169,
      "epoch": 0.052616952292728114,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014544824371114373,
      "kl": 0.001244504179339856,
      "learning_rate": 9.894858730893932e-07,
      "loss": 0.0001,
      "num_tokens": 31270857.0,
      "reward": 0.48153844475746155,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.48153844475746155,
      "rewards/reward_func/std": 0.0,
      "step": 1136,
      "step_time": 17.424078673124313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 190.875,
      "completions/mean_terminated_length": 190.875,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.23381546139717102,
      "epoch": 0.05266327003242242,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005960245616734028,
      "kl": 0.0009035322000272572,
      "learning_rate": 9.894766095414543e-07,
      "loss": 0.0,
      "num_tokens": 31306791.0,
      "reward": 0.8657099008560181,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8657099008560181,
      "rewards/reward_func/std": 0.0,
      "step": 1137,
      "step_time": 22.284610524773598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 172.5625,
      "completions/mean_terminated_length": 172.5625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.2746679037809372,
      "epoch": 0.05270958777211672,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000632460811175406,
      "kl": 0.0009402802097611129,
      "learning_rate": 9.894673459935155e-07,
      "loss": 0.0,
      "num_tokens": 31332064.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1138,
      "step_time": 17.917193945497274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 197.625,
      "completions/mean_terminated_length": 197.625,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.31648997217416763,
      "epoch": 0.05275590551181102,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0961669459939003,
      "kl": 0.0015563028864562511,
      "learning_rate": 9.894580824455766e-07,
      "loss": 0.0091,
      "num_tokens": 31361258.0,
      "reward": 0.5844430327415466,
      "reward_std": 0.19984734058380127,
      "rewards/reward_func/mean": 0.5844430327415466,
      "rewards/reward_func/std": 0.19984734058380127,
      "step": 1139,
      "step_time": 22.793893687427044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 151.0,
      "completions/mean_terminated_length": 151.0,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.41440724581480026,
      "epoch": 0.052802223251505326,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001131863216869533,
      "kl": 0.0013655726215802133,
      "learning_rate": 9.894488188976377e-07,
      "loss": 0.0001,
      "num_tokens": 31405834.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1140,
      "step_time": 21.52340066432953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 188.6875,
      "completions/mean_terminated_length": 188.6875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.4362773597240448,
      "epoch": 0.05284854099119963,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0015412763459607959,
      "kl": 0.002005956834182143,
      "learning_rate": 9.894395553496988e-07,
      "loss": 0.0001,
      "num_tokens": 31433925.0,
      "reward": 3.123703251617371e-08,
      "reward_std": 1.2494813006469485e-07,
      "rewards/reward_func/mean": 3.123703251617371e-08,
      "rewards/reward_func/std": 1.2494813006469485e-07,
      "step": 1141,
      "step_time": 21.189816020429134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 206.0625,
      "completions/mean_terminated_length": 206.0625,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.33258073776960373,
      "epoch": 0.05289485873089393,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.061463385820388794,
      "kl": 0.0018322475953027606,
      "learning_rate": 9.8943029180176e-07,
      "loss": -0.0428,
      "num_tokens": 31466278.0,
      "reward": 0.9463515877723694,
      "reward_std": 0.09895078837871552,
      "rewards/reward_func/mean": 0.9463515877723694,
      "rewards/reward_func/std": 0.09895079582929611,
      "step": 1142,
      "step_time": 22.81053288653493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 191.0625,
      "completions/mean_terminated_length": 191.0625,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.24916226789355278,
      "epoch": 0.052941176470588235,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006731266039423645,
      "kl": 0.0008510870393365622,
      "learning_rate": 9.89421028253821e-07,
      "loss": 0.0,
      "num_tokens": 31487991.0,
      "reward": 0.6411803960800171,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6411803960800171,
      "rewards/reward_func/std": 0.0,
      "step": 1143,
      "step_time": 19.348943434655666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 185.6875,
      "completions/mean_terminated_length": 185.6875,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.21044782549142838,
      "epoch": 0.05298749421028254,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08757368475198746,
      "kl": 0.0008568099146941677,
      "learning_rate": 9.894117647058822e-07,
      "loss": -0.0052,
      "num_tokens": 31511490.0,
      "reward": 0.9388407468795776,
      "reward_std": 0.019452007487416267,
      "rewards/reward_func/mean": 0.9388407468795776,
      "rewards/reward_func/std": 0.019452018663287163,
      "step": 1144,
      "step_time": 20.02642299607396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 148.25,
      "completions/mean_terminated_length": 148.25,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.22547230869531631,
      "epoch": 0.05303381194997684,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08268888294696808,
      "kl": 0.0016655647195875645,
      "learning_rate": 9.894025011579433e-07,
      "loss": 0.0064,
      "num_tokens": 31533286.0,
      "reward": 0.877037763595581,
      "reward_std": 0.23387674987316132,
      "rewards/reward_func/mean": 0.877037763595581,
      "rewards/reward_func/std": 0.2338767647743225,
      "step": 1145,
      "step_time": 17.61340966448188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 163.6875,
      "completions/mean_terminated_length": 163.6875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.40419651567935944,
      "epoch": 0.05308012968967114,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009964038617908955,
      "kl": 0.00159594661090523,
      "learning_rate": 9.893932376100047e-07,
      "loss": 0.0001,
      "num_tokens": 31554401.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1146,
      "step_time": 17.55034012719989
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 157.0,
      "completions/mean_terminated_length": 157.0,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3126239702105522,
      "epoch": 0.053126447429365446,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022115272004157305,
      "kl": 0.0015431630599778146,
      "learning_rate": 9.893839740620658e-07,
      "loss": 0.0001,
      "num_tokens": 31591521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1147,
      "step_time": 19.716587338596582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 168.1875,
      "completions/mean_terminated_length": 168.1875,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.1232264544814825,
      "epoch": 0.05317276516905975,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009384130826219916,
      "kl": 0.0005930302359047346,
      "learning_rate": 9.89374710514127e-07,
      "loss": 0.0,
      "num_tokens": 31614532.0,
      "reward": 0.910879909992218,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.910879909992218,
      "rewards/reward_func/std": 0.0,
      "step": 1148,
      "step_time": 16.905188091099262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 317.0,
      "completions/max_terminated_length": 317.0,
      "completions/mean_length": 294.625,
      "completions/mean_terminated_length": 294.625,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "entropy": 0.15040616318583488,
      "epoch": 0.05321908290875405,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07871180027723312,
      "kl": 0.0007263211882673204,
      "learning_rate": 9.89365446966188e-07,
      "loss": 0.0078,
      "num_tokens": 31642126.0,
      "reward": 0.7928786277770996,
      "reward_std": 0.11949107050895691,
      "rewards/reward_func/mean": 0.7928786277770996,
      "rewards/reward_func/std": 0.11949107050895691,
      "step": 1149,
      "step_time": 27.762278582900763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 337.0,
      "completions/max_terminated_length": 337.0,
      "completions/mean_length": 200.3125,
      "completions/mean_terminated_length": 200.3125,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.29706262052059174,
      "epoch": 0.053265400648448355,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10142429172992706,
      "kl": 0.0018482427985873073,
      "learning_rate": 9.893561834182492e-07,
      "loss": 0.0923,
      "num_tokens": 31665299.0,
      "reward": 0.18208497762680054,
      "reward_std": 0.12976574897766113,
      "rewards/reward_func/mean": 0.18208497762680054,
      "rewards/reward_func/std": 0.12976574897766113,
      "step": 1150,
      "step_time": 29.01048344746232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 294.0,
      "completions/max_terminated_length": 294.0,
      "completions/mean_length": 211.625,
      "completions/mean_terminated_length": 211.625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.37995871901512146,
      "epoch": 0.05331171838814266,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00652031134814024,
      "kl": 0.0019164951227139682,
      "learning_rate": 9.893469198703103e-07,
      "loss": 0.0001,
      "num_tokens": 31697565.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1151,
      "step_time": 26.846133541315794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 185.5625,
      "completions/mean_terminated_length": 185.5625,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.38745927810668945,
      "epoch": 0.05335803612783696,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008911601034924388,
      "kl": 0.0012029419594909996,
      "learning_rate": 9.893376563223714e-07,
      "loss": 0.0001,
      "num_tokens": 31722310.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1152,
      "step_time": 21.08556577935815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 203.0,
      "completions/mean_terminated_length": 203.0,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.4134936258196831,
      "epoch": 0.053404353867531264,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010713719530031085,
      "kl": 0.00138545356458053,
      "learning_rate": 9.893283927744326e-07,
      "loss": 0.0001,
      "num_tokens": 31749766.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1153,
      "step_time": 22.26845372840762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 160.75,
      "completions/mean_terminated_length": 160.75,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.14946546405553818,
      "epoch": 0.05345067160722557,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008816413464955986,
      "kl": 0.0009540772443870082,
      "learning_rate": 9.893191292264937e-07,
      "loss": 0.0,
      "num_tokens": 31778002.0,
      "reward": 0.9534969329833984,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9534969329833984,
      "rewards/reward_func/std": 0.0,
      "step": 1154,
      "step_time": 17.70001668483019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 161.25,
      "completions/mean_terminated_length": 161.25,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.4476577043533325,
      "epoch": 0.05349698934691987,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009187766700051725,
      "kl": 0.0016943172086030245,
      "learning_rate": 9.893098656785548e-07,
      "loss": 0.0001,
      "num_tokens": 31820182.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1155,
      "step_time": 23.56076095253229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 151.6875,
      "completions/mean_terminated_length": 151.6875,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3805708736181259,
      "epoch": 0.05354330708661417,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012020737631246448,
      "kl": 0.0014396813930943608,
      "learning_rate": 9.89300602130616e-07,
      "loss": 0.0001,
      "num_tokens": 31841009.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1156,
      "step_time": 16.54355525225401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 183.5625,
      "completions/mean_terminated_length": 183.5625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.37823300063610077,
      "epoch": 0.053589624826308475,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013947762781754136,
      "kl": 0.001362399838399142,
      "learning_rate": 9.89291338582677e-07,
      "loss": 0.0001,
      "num_tokens": 31869370.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1157,
      "step_time": 23.252543453127146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 127.0,
      "completions/mean_terminated_length": 127.0,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3380546420812607,
      "epoch": 0.05363594256600278,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002495003864169121,
      "kl": 0.0019408497610129416,
      "learning_rate": 9.892820750347382e-07,
      "loss": 0.0001,
      "num_tokens": 31901066.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1158,
      "step_time": 17.884928941726685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 128.5625,
      "completions/mean_terminated_length": 128.5625,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.2847992070019245,
      "epoch": 0.05368226030569708,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015233656158670783,
      "kl": 0.0013830008392687887,
      "learning_rate": 9.892728114867995e-07,
      "loss": 0.0001,
      "num_tokens": 31920691.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1159,
      "step_time": 15.752903632819653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 199.125,
      "completions/mean_terminated_length": 199.125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.23879999667406082,
      "epoch": 0.053728578045391384,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12035561352968216,
      "kl": 0.0010685274610295892,
      "learning_rate": 9.892635479388606e-07,
      "loss": -0.0452,
      "num_tokens": 31953173.0,
      "reward": 0.5953280925750732,
      "reward_std": 0.23239226639270782,
      "rewards/reward_func/mean": 0.5953280925750732,
      "rewards/reward_func/std": 0.23239228129386902,
      "step": 1160,
      "step_time": 22.353445518761873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 165.875,
      "completions/mean_terminated_length": 165.875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.1503465175628662,
      "epoch": 0.05377489578508569,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005322801880538464,
      "kl": 0.0007290776702575386,
      "learning_rate": 9.892542843909216e-07,
      "loss": 0.0,
      "num_tokens": 31990947.0,
      "reward": 0.9534969329833984,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9534969329833984,
      "rewards/reward_func/std": 0.0,
      "step": 1161,
      "step_time": 21.50071908161044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 192.0625,
      "completions/mean_terminated_length": 192.0625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.4274817705154419,
      "epoch": 0.05382121352477999,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010776615235954523,
      "kl": 0.0014149823691695929,
      "learning_rate": 9.892450208429827e-07,
      "loss": 0.0001,
      "num_tokens": 32022468.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1162,
      "step_time": 22.83984173834324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 144.875,
      "completions/mean_terminated_length": 144.875,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.13758426159620285,
      "epoch": 0.05386753126447429,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005651933024637401,
      "kl": 0.0006001776200719178,
      "learning_rate": 9.89235757295044e-07,
      "loss": 0.0,
      "num_tokens": 32043794.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1163,
      "step_time": 17.197531394660473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 158.625,
      "completions/mean_terminated_length": 158.625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3581901490688324,
      "epoch": 0.053913849004168596,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011381652439013124,
      "kl": 0.0012314816704019904,
      "learning_rate": 9.892264937471051e-07,
      "loss": 0.0001,
      "num_tokens": 32072748.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1164,
      "step_time": 18.88974129408598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 143.4375,
      "completions/mean_terminated_length": 143.4375,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.2144453153014183,
      "epoch": 0.0539601667438629,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14213523268699646,
      "kl": 0.0011701754992827773,
      "learning_rate": 9.892172301991663e-07,
      "loss": -0.0588,
      "num_tokens": 32092979.0,
      "reward": 0.9465265274047852,
      "reward_std": 0.0712980255484581,
      "rewards/reward_func/mean": 0.9465265274047852,
      "rewards/reward_func/std": 0.0712980329990387,
      "step": 1165,
      "step_time": 15.757043085992336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 177.8125,
      "completions/mean_terminated_length": 177.8125,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.3314700797200203,
      "epoch": 0.0540064844835572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13920459151268005,
      "kl": 0.001862278499174863,
      "learning_rate": 9.892079666512274e-07,
      "loss": -0.0647,
      "num_tokens": 32114688.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 1166,
      "step_time": 19.252146907150745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 116.375,
      "completions/mean_terminated_length": 116.375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2919832020998001,
      "epoch": 0.054052802223251505,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001161219784989953,
      "kl": 0.0014462788676610216,
      "learning_rate": 9.891987031032885e-07,
      "loss": 0.0001,
      "num_tokens": 32134950.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1167,
      "step_time": 12.909465335309505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 129.0,
      "completions/max_terminated_length": 129.0,
      "completions/mean_length": 111.625,
      "completions/mean_terminated_length": 111.625,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.2536385953426361,
      "epoch": 0.05409911996294581,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01523127406835556,
      "kl": 0.0036446138692554086,
      "learning_rate": 9.891894395553496e-07,
      "loss": 0.0002,
      "num_tokens": 32154224.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1168,
      "step_time": 12.699834518134594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 141.75,
      "completions/mean_terminated_length": 141.75,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3179233968257904,
      "epoch": 0.05414543770264011,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001962218666449189,
      "kl": 0.001700406864983961,
      "learning_rate": 9.891801760074108e-07,
      "loss": 0.0001,
      "num_tokens": 32177292.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1169,
      "step_time": 15.205468282103539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 122.9375,
      "completions/mean_terminated_length": 122.9375,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.26094047352671623,
      "epoch": 0.05419175544233441,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013599522644653916,
      "kl": 0.0013964295794721693,
      "learning_rate": 9.891709124594719e-07,
      "loss": 0.0001,
      "num_tokens": 32197835.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1170,
      "step_time": 13.122005488723516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 129.0,
      "completions/mean_terminated_length": 129.0,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.36509843170642853,
      "epoch": 0.054238073182028716,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004078730009496212,
      "kl": 0.002764565113466233,
      "learning_rate": 9.89161648911533e-07,
      "loss": 0.0001,
      "num_tokens": 32234411.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1171,
      "step_time": 18.717920504510403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 141.9375,
      "completions/mean_terminated_length": 141.9375,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.35093528777360916,
      "epoch": 0.05428439092172302,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012465318432077765,
      "kl": 0.001481231243815273,
      "learning_rate": 9.891523853635941e-07,
      "loss": 0.0001,
      "num_tokens": 32259306.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1172,
      "step_time": 15.76487848162651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 219.1875,
      "completions/mean_terminated_length": 219.1875,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.19795706868171692,
      "epoch": 0.05433070866141732,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07324954867362976,
      "kl": 0.0010112493037013337,
      "learning_rate": 9.891431218156555e-07,
      "loss": -0.0311,
      "num_tokens": 32297421.0,
      "reward": 0.8825951814651489,
      "reward_std": 0.10946193337440491,
      "rewards/reward_func/mean": 0.8825951814651489,
      "rewards/reward_func/std": 0.10946192592382431,
      "step": 1173,
      "step_time": 24.80542005226016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 189.6875,
      "completions/mean_terminated_length": 189.6875,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.419601745903492,
      "epoch": 0.054377026401111625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008468477753922343,
      "kl": 0.0015795796643942595,
      "learning_rate": 9.891338582677164e-07,
      "loss": 0.0001,
      "num_tokens": 32334888.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1174,
      "step_time": 23.496656615287066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 155.1875,
      "completions/mean_terminated_length": 155.1875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.20617348700761795,
      "epoch": 0.05442334414080593,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014567967737093568,
      "kl": 0.001166216330602765,
      "learning_rate": 9.891245947197775e-07,
      "loss": 0.0001,
      "num_tokens": 32361995.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1175,
      "step_time": 17.300379287451506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 238.625,
      "completions/mean_terminated_length": 238.625,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.4495939090847969,
      "epoch": 0.05446966188050023,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009323477279394865,
      "kl": 0.0013735440734308213,
      "learning_rate": 9.891153311718389e-07,
      "loss": 0.0001,
      "num_tokens": 32389189.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1176,
      "step_time": 27.233849480748177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 138.875,
      "completions/mean_terminated_length": 138.875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.3000299036502838,
      "epoch": 0.054515979620194534,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003468952374532819,
      "kl": 0.002486981393303722,
      "learning_rate": 9.891060676239e-07,
      "loss": 0.0001,
      "num_tokens": 32411219.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1177,
      "step_time": 16.326974477618933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 195.875,
      "completions/mean_terminated_length": 195.875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.42353833466768265,
      "epoch": 0.05456229735988884,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015964413760229945,
      "kl": 0.0015052240050863475,
      "learning_rate": 9.89096804075961e-07,
      "loss": 0.0001,
      "num_tokens": 32435569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1178,
      "step_time": 20.23856322094798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 195.375,
      "completions/mean_terminated_length": 195.375,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.31634391844272614,
      "epoch": 0.05460861509958314,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11076902598142624,
      "kl": 0.0015534661069978029,
      "learning_rate": 9.890875405280222e-07,
      "loss": -0.0165,
      "num_tokens": 32471479.0,
      "reward": 0.9502800703048706,
      "reward_std": 0.07616504281759262,
      "rewards/reward_func/mean": 0.9502800703048706,
      "rewards/reward_func/std": 0.07616503536701202,
      "step": 1179,
      "step_time": 22.191253505647182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 155.125,
      "completions/mean_terminated_length": 155.125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.43323180079460144,
      "epoch": 0.05465493283927744,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010547670535743237,
      "kl": 0.0017853601893875748,
      "learning_rate": 9.890782769800834e-07,
      "loss": 0.0001,
      "num_tokens": 32501641.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1180,
      "step_time": 18.018289901316166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 175.0625,
      "completions/mean_terminated_length": 175.0625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3106352239847183,
      "epoch": 0.054701250578971745,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018181770574301481,
      "kl": 0.0016223404090851545,
      "learning_rate": 9.890690134321445e-07,
      "loss": 0.0001,
      "num_tokens": 32527018.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1181,
      "step_time": 18.92172461748123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 168.5,
      "completions/mean_terminated_length": 168.5,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.39867446571588516,
      "epoch": 0.05474756831866605,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001653126673772931,
      "kl": 0.001793490257114172,
      "learning_rate": 9.890597498842056e-07,
      "loss": 0.0001,
      "num_tokens": 32573906.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1182,
      "step_time": 23.76753769442439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 121.0,
      "completions/mean_terminated_length": 121.0,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2648680992424488,
      "epoch": 0.05479388605836035,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011571758659556508,
      "kl": 0.0013324348838068545,
      "learning_rate": 9.890504863362667e-07,
      "loss": 0.0001,
      "num_tokens": 32593970.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1183,
      "step_time": 12.634309638291597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 202.5625,
      "completions/mean_terminated_length": 202.5625,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.2193916030228138,
      "epoch": 0.054840203798054654,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024302343372255564,
      "kl": 0.001229857763973996,
      "learning_rate": 9.890412227883279e-07,
      "loss": 0.0001,
      "num_tokens": 32629547.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1184,
      "step_time": 23.20802417770028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 169.4375,
      "completions/mean_terminated_length": 169.4375,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.34451137483119965,
      "epoch": 0.05488652153774896,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021194086875766516,
      "kl": 0.0012443136802176014,
      "learning_rate": 9.89031959240389e-07,
      "loss": 0.0001,
      "num_tokens": 32650642.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1185,
      "step_time": 19.13880906626582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 139.625,
      "completions/mean_terminated_length": 139.625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2950673848390579,
      "epoch": 0.05493283927744326,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001744111068546772,
      "kl": 0.0014879111549817026,
      "learning_rate": 9.8902269569245e-07,
      "loss": 0.0001,
      "num_tokens": 32670204.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1186,
      "step_time": 16.21994575858116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 127.6875,
      "completions/mean_terminated_length": 127.6875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3094567731022835,
      "epoch": 0.05497915701713756,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009803370339795947,
      "kl": 0.0011915433715330437,
      "learning_rate": 9.890134321445112e-07,
      "loss": 0.0001,
      "num_tokens": 32692039.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1187,
      "step_time": 13.96587160229683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 188.125,
      "completions/mean_terminated_length": 188.125,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.1543944925069809,
      "epoch": 0.055025474756831866,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006705935113132,
      "kl": 0.0017742169729899615,
      "learning_rate": 9.890041685965724e-07,
      "loss": 0.0001,
      "num_tokens": 32727337.0,
      "reward": 0.9343348145484924,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9343348145484924,
      "rewards/reward_func/std": 0.0,
      "step": 1188,
      "step_time": 21.417829602956772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 127.0625,
      "completions/mean_terminated_length": 127.0625,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2924730107188225,
      "epoch": 0.05507179249652617,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020096004009246826,
      "kl": 0.001471274415962398,
      "learning_rate": 9.889949050486337e-07,
      "loss": 0.0001,
      "num_tokens": 32749546.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1189,
      "step_time": 14.448669698089361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 136.375,
      "completions/mean_terminated_length": 136.375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.31685884296894073,
      "epoch": 0.05511811023622047,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013721493305638433,
      "kl": 0.00156321088434197,
      "learning_rate": 9.889856415006948e-07,
      "loss": 0.0001,
      "num_tokens": 32779216.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1190,
      "step_time": 16.542190439999104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 144.6875,
      "completions/mean_terminated_length": 144.6875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.20903745666146278,
      "epoch": 0.055164427975914775,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001146889291703701,
      "kl": 0.0010980116057908162,
      "learning_rate": 9.88976377952756e-07,
      "loss": 0.0001,
      "num_tokens": 32803179.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 1191,
      "step_time": 15.837680958211422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 136.1875,
      "completions/mean_terminated_length": 136.1875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.28850478678941727,
      "epoch": 0.05521074571560908,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017243772745132446,
      "kl": 0.0012638849730137736,
      "learning_rate": 9.889671144048169e-07,
      "loss": 0.0001,
      "num_tokens": 32822974.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1192,
      "step_time": 14.529932048171759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 145.6875,
      "completions/mean_terminated_length": 145.6875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.3776225745677948,
      "epoch": 0.05525706345530338,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002197980647906661,
      "kl": 0.002267822274006903,
      "learning_rate": 9.889578508568782e-07,
      "loss": 0.0001,
      "num_tokens": 32879657.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1193,
      "step_time": 25.480321776121855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 126.4375,
      "completions/mean_terminated_length": 126.4375,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.3518354445695877,
      "epoch": 0.05530338119499768,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002201296156272292,
      "kl": 0.0016532180598005652,
      "learning_rate": 9.889485873089393e-07,
      "loss": 0.0001,
      "num_tokens": 32905280.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1194,
      "step_time": 15.141155265271664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 215.125,
      "completions/mean_terminated_length": 215.125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.25013238564133644,
      "epoch": 0.055349698934691986,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06812913715839386,
      "kl": 0.0009467493218835443,
      "learning_rate": 9.889393237610004e-07,
      "loss": -0.0446,
      "num_tokens": 32938930.0,
      "reward": 0.8982654809951782,
      "reward_std": 0.005985993891954422,
      "rewards/reward_func/mean": 0.8982654809951782,
      "rewards/reward_func/std": 0.005985994357615709,
      "step": 1195,
      "step_time": 23.7807700894773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 193.125,
      "completions/mean_terminated_length": 193.125,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.36999668926000595,
      "epoch": 0.05539601667438629,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000533161626663059,
      "kl": 0.0010291463549947366,
      "learning_rate": 9.889300602130616e-07,
      "loss": 0.0001,
      "num_tokens": 32968164.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1196,
      "step_time": 20.96423965319991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 120.5,
      "completions/mean_terminated_length": 120.5,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.23959479480981827,
      "epoch": 0.05544233441408059,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003938731271773577,
      "kl": 0.001805234351195395,
      "learning_rate": 9.889207966651227e-07,
      "loss": 0.0001,
      "num_tokens": 32988748.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1197,
      "step_time": 13.279081158339977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 163.5,
      "completions/mean_terminated_length": 163.5,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.4478013291954994,
      "epoch": 0.055488652153774895,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016535435570403934,
      "kl": 0.0017541599518153816,
      "learning_rate": 9.889115331171838e-07,
      "loss": 0.0001,
      "num_tokens": 33033604.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1198,
      "step_time": 23.09657371416688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 173.125,
      "completions/mean_terminated_length": 173.125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.35072455555200577,
      "epoch": 0.0555349698934692,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016030474798753858,
      "kl": 0.001472942967666313,
      "learning_rate": 9.88902269569245e-07,
      "loss": 0.0001,
      "num_tokens": 33053974.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1199,
      "step_time": 19.738952070474625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 121.0,
      "completions/max_terminated_length": 121.0,
      "completions/mean_length": 105.125,
      "completions/mean_terminated_length": 105.125,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "entropy": 0.2769193649291992,
      "epoch": 0.0555812876331635,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002373273018747568,
      "kl": 0.0016173394105862826,
      "learning_rate": 9.88893006021306e-07,
      "loss": 0.0001,
      "num_tokens": 33075656.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1200,
      "step_time": 12.320656727999449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 186.0,
      "completions/mean_terminated_length": 186.0,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.34736139327287674,
      "epoch": 0.055627605372857804,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1093740463256836,
      "kl": 0.0011903214908670634,
      "learning_rate": 9.888837424733672e-07,
      "loss": -0.0134,
      "num_tokens": 33099016.0,
      "reward": 0.856032133102417,
      "reward_std": 0.22827444970607758,
      "rewards/reward_func/mean": 0.856032133102417,
      "rewards/reward_func/std": 0.22827443480491638,
      "step": 1201,
      "step_time": 20.020185366272926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 292.0,
      "completions/max_terminated_length": 292.0,
      "completions/mean_length": 222.125,
      "completions/mean_terminated_length": 222.125,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.37158964574337006,
      "epoch": 0.05567392311255211,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09585659950971603,
      "kl": 0.001849939435487613,
      "learning_rate": 9.888744789254283e-07,
      "loss": -0.0273,
      "num_tokens": 33137322.0,
      "reward": 0.14222227036952972,
      "reward_std": 0.12954173982143402,
      "rewards/reward_func/mean": 0.14222227036952972,
      "rewards/reward_func/std": 0.12954175472259521,
      "step": 1202,
      "step_time": 27.632737696170807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 157.1875,
      "completions/mean_terminated_length": 157.1875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.2809433713555336,
      "epoch": 0.05572024085224641,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031857306603342295,
      "kl": 0.0016545486578252167,
      "learning_rate": 9.888652153774896e-07,
      "loss": 0.0001,
      "num_tokens": 33173853.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1203,
      "step_time": 20.077237356454134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 127.0,
      "completions/max_terminated_length": 127.0,
      "completions/mean_length": 106.375,
      "completions/mean_terminated_length": 106.375,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.28098398447036743,
      "epoch": 0.05576655859194071,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024516689591109753,
      "kl": 0.001706851733615622,
      "learning_rate": 9.888559518295508e-07,
      "loss": 0.0001,
      "num_tokens": 33193779.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1204,
      "step_time": 12.8146958835423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 169.75,
      "completions/mean_terminated_length": 169.75,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.41700856387615204,
      "epoch": 0.055812876331635015,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019709744956344366,
      "kl": 0.0015683734090998769,
      "learning_rate": 9.888466882816117e-07,
      "loss": 0.0001,
      "num_tokens": 33217503.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1205,
      "step_time": 18.581989858299494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 127.9375,
      "completions/mean_terminated_length": 127.9375,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.2673864811658859,
      "epoch": 0.05585919407132932,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016961885849013925,
      "kl": 0.0012344536662567407,
      "learning_rate": 9.88837424733673e-07,
      "loss": 0.0001,
      "num_tokens": 33237262.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1206,
      "step_time": 13.16928181797266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 220.625,
      "completions/mean_terminated_length": 220.625,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.3380234017968178,
      "epoch": 0.05590551181102362,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0912645012140274,
      "kl": 0.0015399760741274804,
      "learning_rate": 9.888281611857341e-07,
      "loss": -0.0836,
      "num_tokens": 33263304.0,
      "reward": 0.4474213719367981,
      "reward_std": 0.49688494205474854,
      "rewards/reward_func/mean": 0.4474213719367981,
      "rewards/reward_func/std": 0.4968849718570709,
      "step": 1207,
      "step_time": 23.88822455331683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 143.5625,
      "completions/mean_terminated_length": 143.5625,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.2060914896428585,
      "epoch": 0.055951829550717924,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038170551415532827,
      "kl": 0.001533936767373234,
      "learning_rate": 9.888188976377953e-07,
      "loss": 0.0001,
      "num_tokens": 33283761.0,
      "reward": 0.7420884966850281,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7420884966850281,
      "rewards/reward_func/std": 0.0,
      "step": 1208,
      "step_time": 14.947349477559328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 127.5,
      "completions/mean_terminated_length": 127.5,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2600145637989044,
      "epoch": 0.05599814729041223,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0047537581995129585,
      "kl": 0.001497937657404691,
      "learning_rate": 9.888096340898564e-07,
      "loss": 0.0001,
      "num_tokens": 33304233.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1209,
      "step_time": 13.457011204212904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 269.0,
      "completions/max_terminated_length": 269.0,
      "completions/mean_length": 212.0625,
      "completions/mean_terminated_length": 212.0625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.40265387296676636,
      "epoch": 0.05604446503010653,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001125058624893427,
      "kl": 0.001514648989541456,
      "learning_rate": 9.888003705419175e-07,
      "loss": 0.0001,
      "num_tokens": 33335530.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1210,
      "step_time": 24.818960841745138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 174.1875,
      "completions/mean_terminated_length": 174.1875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.391539990901947,
      "epoch": 0.05609078276980083,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011921962723135948,
      "kl": 0.0015644820232409984,
      "learning_rate": 9.887911069939786e-07,
      "loss": 0.0001,
      "num_tokens": 33370685.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1211,
      "step_time": 20.84784833714366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 164.3125,
      "completions/mean_terminated_length": 164.3125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.35753049701452255,
      "epoch": 0.056137100509495136,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001809712266549468,
      "kl": 0.0012707883870461956,
      "learning_rate": 9.887818434460398e-07,
      "loss": 0.0001,
      "num_tokens": 33404258.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1212,
      "step_time": 19.671447813510895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 187.625,
      "completions/mean_terminated_length": 187.625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.22351354733109474,
      "epoch": 0.05618341824918944,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008563286392018199,
      "kl": 0.0010690609051380306,
      "learning_rate": 9.88772579898101e-07,
      "loss": 0.0001,
      "num_tokens": 33439500.0,
      "reward": 0.8089976906776428,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8089976906776428,
      "rewards/reward_func/std": 0.0,
      "step": 1213,
      "step_time": 22.688255954533815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 126.4375,
      "completions/mean_terminated_length": 126.4375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.22193920984864235,
      "epoch": 0.05622973598888374,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006390352500602603,
      "kl": 0.0008537031535524875,
      "learning_rate": 9.88763316350162e-07,
      "loss": 0.0,
      "num_tokens": 33465971.0,
      "reward": 0.1533549726009369,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.1533549726009369,
      "rewards/reward_func/std": 0.0,
      "step": 1214,
      "step_time": 15.766603652387857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 208.0625,
      "completions/mean_terminated_length": 208.0625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.26157864928245544,
      "epoch": 0.056276053728578045,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07187444716691971,
      "kl": 0.0011387507256586105,
      "learning_rate": 9.887540528022231e-07,
      "loss": -0.0041,
      "num_tokens": 33501764.0,
      "reward": 0.8787410259246826,
      "reward_std": 0.029620526358485222,
      "rewards/reward_func/mean": 0.8787410259246826,
      "rewards/reward_func/std": 0.02962053380906582,
      "step": 1215,
      "step_time": 22.862708177417517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 130.6875,
      "completions/mean_terminated_length": 130.6875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2815399691462517,
      "epoch": 0.05632237146827235,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015339228324592113,
      "kl": 0.0035644937597680837,
      "learning_rate": 9.887447892542845e-07,
      "loss": 0.0002,
      "num_tokens": 33524079.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1216,
      "step_time": 16.1451876796782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 172.3125,
      "completions/mean_terminated_length": 172.3125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.37928324192762375,
      "epoch": 0.05636868920796665,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00085402064723894,
      "kl": 0.0013867141969967633,
      "learning_rate": 9.887355257063454e-07,
      "loss": 0.0001,
      "num_tokens": 33550820.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1217,
      "step_time": 18.440692875534296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 141.125,
      "completions/mean_terminated_length": 141.125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.2438240870833397,
      "epoch": 0.05641500694766095,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002116305520758033,
      "kl": 0.0013620586250908673,
      "learning_rate": 9.887262621584065e-07,
      "loss": 0.0001,
      "num_tokens": 33570518.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1218,
      "step_time": 15.654694091528654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 121.3125,
      "completions/mean_terminated_length": 121.3125,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.3376108705997467,
      "epoch": 0.056461324687355256,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010410158894956112,
      "kl": 0.0020358639303594828,
      "learning_rate": 9.887169986104679e-07,
      "loss": 0.0001,
      "num_tokens": 33594411.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1219,
      "step_time": 14.020819757133722
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 167.75,
      "completions/mean_terminated_length": 167.75,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.4610692411661148,
      "epoch": 0.05650764242704956,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010968097485601902,
      "kl": 0.0017872247844934464,
      "learning_rate": 9.88707735062529e-07,
      "loss": 0.0001,
      "num_tokens": 33643911.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1220,
      "step_time": 24.36411403864622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 164.0625,
      "completions/mean_terminated_length": 164.0625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.42141059041023254,
      "epoch": 0.05655396016674386,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008742156787775457,
      "kl": 0.0014788057305850089,
      "learning_rate": 9.886984715145901e-07,
      "loss": 0.0001,
      "num_tokens": 33686088.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1221,
      "step_time": 21.88529008999467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 144.375,
      "completions/mean_terminated_length": 144.375,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.411409430205822,
      "epoch": 0.056600277906438165,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018694042228162289,
      "kl": 0.0017541041306685656,
      "learning_rate": 9.886892079666512e-07,
      "loss": 0.0001,
      "num_tokens": 33709326.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1222,
      "step_time": 16.851913671940565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 149.625,
      "completions/mean_terminated_length": 149.625,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.460119292140007,
      "epoch": 0.05664659564613247,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016819029115140438,
      "kl": 0.0019150854786857963,
      "learning_rate": 9.886799444187124e-07,
      "loss": 0.0001,
      "num_tokens": 33752936.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1223,
      "step_time": 25.403372287750244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 142.75,
      "completions/mean_terminated_length": 142.75,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.35317961871623993,
      "epoch": 0.05669291338582677,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015910225920379162,
      "kl": 0.0015965857601258904,
      "learning_rate": 9.886706808707735e-07,
      "loss": 0.0001,
      "num_tokens": 33774436.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1224,
      "step_time": 15.178873017430305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 140.125,
      "completions/mean_terminated_length": 140.125,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.31188593804836273,
      "epoch": 0.056739231125521074,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028630243614315987,
      "kl": 0.0015592757263220847,
      "learning_rate": 9.886614173228346e-07,
      "loss": 0.0001,
      "num_tokens": 33795334.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1225,
      "step_time": 16.386144682765007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 126.1875,
      "completions/mean_terminated_length": 126.1875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.283845491707325,
      "epoch": 0.05678554886521538,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033356109634041786,
      "kl": 0.0016911396523937583,
      "learning_rate": 9.886521537748957e-07,
      "loss": 0.0001,
      "num_tokens": 33819289.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1226,
      "step_time": 14.298079270869493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 159.75,
      "completions/mean_terminated_length": 159.75,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.3166779801249504,
      "epoch": 0.05683186660490968,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013046388048678637,
      "kl": 0.0015917142445687205,
      "learning_rate": 9.886428902269569e-07,
      "loss": 0.0001,
      "num_tokens": 33846245.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1227,
      "step_time": 18.61289867013693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 217.5,
      "completions/mean_terminated_length": 217.5,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.21056405827403069,
      "epoch": 0.05687818434460398,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07488119602203369,
      "kl": 0.0014644028851762414,
      "learning_rate": 9.88633626679018e-07,
      "loss": 0.0307,
      "num_tokens": 33884749.0,
      "reward": 0.6096057891845703,
      "reward_std": 0.024398334324359894,
      "rewards/reward_func/mean": 0.6096057891845703,
      "rewards/reward_func/std": 0.02439834736287594,
      "step": 1228,
      "step_time": 25.749790344387293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 169.5625,
      "completions/mean_terminated_length": 169.5625,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.38020822405815125,
      "epoch": 0.056924502084298285,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016944898525252938,
      "kl": 0.001812828064430505,
      "learning_rate": 9.886243631310791e-07,
      "loss": 0.0001,
      "num_tokens": 33911494.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1229,
      "step_time": 19.604694467037916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 242.8125,
      "completions/mean_terminated_length": 242.8125,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "entropy": 0.27098196744918823,
      "epoch": 0.05697081982399259,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004719934950117022,
      "kl": 0.000758229085477069,
      "learning_rate": 9.886150995831402e-07,
      "loss": 0.0,
      "num_tokens": 33943395.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1230,
      "step_time": 26.482955258339643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 129.6875,
      "completions/mean_terminated_length": 129.6875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2782093659043312,
      "epoch": 0.05701713756368689,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008192937239073217,
      "kl": 0.0009703707764856517,
      "learning_rate": 9.886058360352014e-07,
      "loss": 0.0,
      "num_tokens": 33967374.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1231,
      "step_time": 15.097816903144121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 195.875,
      "completions/mean_terminated_length": 195.875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.40940943360328674,
      "epoch": 0.057063455303381194,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016419700114056468,
      "kl": 0.002018047438468784,
      "learning_rate": 9.885965724872625e-07,
      "loss": 0.0001,
      "num_tokens": 34000716.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1232,
      "step_time": 23.017534095793962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 128.9375,
      "completions/mean_terminated_length": 128.9375,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.2268138900399208,
      "epoch": 0.0571097730430755,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001921424176543951,
      "kl": 0.0010678358376026154,
      "learning_rate": 9.885873089393238e-07,
      "loss": 0.0001,
      "num_tokens": 34020619.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1233,
      "step_time": 13.981297962367535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 141.5625,
      "completions/mean_terminated_length": 141.5625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3581574782729149,
      "epoch": 0.0571560907827698,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008119179983623326,
      "kl": 0.001180766790639609,
      "learning_rate": 9.88578045391385e-07,
      "loss": 0.0001,
      "num_tokens": 34041972.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1234,
      "step_time": 15.030056152492762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 351.0,
      "completions/max_terminated_length": 351.0,
      "completions/mean_length": 322.0,
      "completions/mean_terminated_length": 322.0,
      "completions/min_length": 257.0,
      "completions/min_terminated_length": 257.0,
      "entropy": 0.245905090123415,
      "epoch": 0.0572024085224641,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12357914447784424,
      "kl": 0.0013075890019536018,
      "learning_rate": 9.885687818434459e-07,
      "loss": -0.0504,
      "num_tokens": 34078116.0,
      "reward": 0.9262726306915283,
      "reward_std": 0.24700602889060974,
      "rewards/reward_func/mean": 0.9262726306915283,
      "rewards/reward_func/std": 0.24700602889060974,
      "step": 1235,
      "step_time": 32.29124540835619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 130.625,
      "completions/mean_terminated_length": 130.625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.35061635822057724,
      "epoch": 0.057248726262158406,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010514730820432305,
      "kl": 0.0012754994095303118,
      "learning_rate": 9.885595182955072e-07,
      "loss": 0.0001,
      "num_tokens": 34099454.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1236,
      "step_time": 14.005634594708681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 155.9375,
      "completions/mean_terminated_length": 155.9375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.32259558141231537,
      "epoch": 0.05729504400185271,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08124643564224243,
      "kl": 0.0022128417913336307,
      "learning_rate": 9.885502547475683e-07,
      "loss": -0.0096,
      "num_tokens": 34120189.0,
      "reward": 0.058713316917419434,
      "reward_std": 0.23485326766967773,
      "rewards/reward_func/mean": 0.058713316917419434,
      "rewards/reward_func/std": 0.23485328257083893,
      "step": 1237,
      "step_time": 16.244460076093674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 149.875,
      "completions/mean_terminated_length": 149.875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.24214792996644974,
      "epoch": 0.05734136174154701,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07267358899116516,
      "kl": 0.0013592233299277723,
      "learning_rate": 9.885409911996294e-07,
      "loss": 0.0104,
      "num_tokens": 34144075.0,
      "reward": 0.8419013023376465,
      "reward_std": 0.23843029141426086,
      "rewards/reward_func/mean": 0.8419013023376465,
      "rewards/reward_func/std": 0.23843029141426086,
      "step": 1238,
      "step_time": 15.847119845449924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 126.375,
      "completions/mean_terminated_length": 126.375,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2643110007047653,
      "epoch": 0.057387679481241315,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016059990739449859,
      "kl": 0.0014530277985613793,
      "learning_rate": 9.885317276516906e-07,
      "loss": 0.0001,
      "num_tokens": 34164225.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1239,
      "step_time": 13.541017275303602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 138.8125,
      "completions/mean_terminated_length": 138.8125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3196500539779663,
      "epoch": 0.05743399722093562,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009587425156496465,
      "kl": 0.0013537496561184525,
      "learning_rate": 9.885224641037517e-07,
      "loss": 0.0001,
      "num_tokens": 34185982.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1240,
      "step_time": 14.922610383480787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 170.0,
      "completions/mean_terminated_length": 170.0,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.4293387234210968,
      "epoch": 0.05748031496062992,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001246821484528482,
      "kl": 0.001713241741526872,
      "learning_rate": 9.885132005558128e-07,
      "loss": 0.0001,
      "num_tokens": 34237198.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1241,
      "step_time": 25.724534645676613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 182.5,
      "completions/mean_terminated_length": 182.5,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.38223453611135483,
      "epoch": 0.05752663270032422,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012479997240006924,
      "kl": 0.001559621945489198,
      "learning_rate": 9.88503937007874e-07,
      "loss": 0.0001,
      "num_tokens": 34263798.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1242,
      "step_time": 19.806608445942402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 207.1875,
      "completions/mean_terminated_length": 207.1875,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.26252343505620956,
      "epoch": 0.057572950440018526,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08640774339437485,
      "kl": 0.0012831655621994287,
      "learning_rate": 9.88494673459935e-07,
      "loss": -0.0306,
      "num_tokens": 34301817.0,
      "reward": 0.8629425764083862,
      "reward_std": 0.3384329676628113,
      "rewards/reward_func/mean": 0.8629425764083862,
      "rewards/reward_func/std": 0.3384329676628113,
      "step": 1243,
      "step_time": 24.460368610918522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 200.6875,
      "completions/mean_terminated_length": 200.6875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.42235323786735535,
      "epoch": 0.05761926817971283,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008627405040897429,
      "kl": 0.0012830222549382597,
      "learning_rate": 9.884854099119962e-07,
      "loss": 0.0001,
      "num_tokens": 34324436.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1244,
      "step_time": 20.84190797433257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 175.4375,
      "completions/mean_terminated_length": 175.4375,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.15327590703964233,
      "epoch": 0.05766558591940713,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006670938455499709,
      "kl": 0.0006721837999066338,
      "learning_rate": 9.884761463640573e-07,
      "loss": 0.0,
      "num_tokens": 34369067.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 1245,
      "step_time": 23.57464948296547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 123.5,
      "completions/mean_terminated_length": 123.5,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.24939283728599548,
      "epoch": 0.057711903659101435,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009782507549971342,
      "kl": 0.0010339748696424067,
      "learning_rate": 9.884668828161187e-07,
      "loss": 0.0001,
      "num_tokens": 34390675.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1246,
      "step_time": 15.103761825710535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 126.0,
      "completions/max_terminated_length": 126.0,
      "completions/mean_length": 108.875,
      "completions/mean_terminated_length": 108.875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.26170995458960533,
      "epoch": 0.05775822139879574,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014053724007681012,
      "kl": 0.0011575254320632666,
      "learning_rate": 9.884576192681798e-07,
      "loss": 0.0001,
      "num_tokens": 34410673.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1247,
      "step_time": 12.882812615484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 223.0,
      "completions/mean_terminated_length": 223.0,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "entropy": 0.16441050171852112,
      "epoch": 0.05780453913849004,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004503272648435086,
      "kl": 0.0005819772122777067,
      "learning_rate": 9.884483557202407e-07,
      "loss": 0.0,
      "num_tokens": 34435201.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1248,
      "step_time": 21.09315648302436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 173.5,
      "completions/mean_terminated_length": 173.5,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.36547720432281494,
      "epoch": 0.057850856878184344,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015186961973086,
      "kl": 0.001675381965469569,
      "learning_rate": 9.88439092172302e-07,
      "loss": 0.0001,
      "num_tokens": 34468457.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1249,
      "step_time": 20.333508122712374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 419.0,
      "completions/max_terminated_length": 419.0,
      "completions/mean_length": 263.8125,
      "completions/mean_terminated_length": 263.8125,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "entropy": 0.39689671248197556,
      "epoch": 0.05789717461787865,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06250195950269699,
      "kl": 0.0014216976414900273,
      "learning_rate": 9.884298286243632e-07,
      "loss": -0.0612,
      "num_tokens": 34496134.0,
      "reward": 0.5,
      "reward_std": 0.5163977742195129,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5163977742195129,
      "step": 1250,
      "step_time": 34.22820543497801
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 533.0,
      "completions/max_terminated_length": 533.0,
      "completions/mean_length": 241.6875,
      "completions/mean_terminated_length": 241.6875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.46069467067718506,
      "epoch": 0.05794349235757295,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05643896013498306,
      "kl": 0.0015777037187945098,
      "learning_rate": 9.884205650764243e-07,
      "loss": 0.0617,
      "num_tokens": 34535009.0,
      "reward": 0.00031467806547880173,
      "reward_std": 0.00018763775005936623,
      "rewards/reward_func/mean": 0.00031467806547880173,
      "rewards/reward_func/std": 0.00018763776461128145,
      "step": 1251,
      "step_time": 45.43677279353142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 179.0,
      "completions/mean_terminated_length": 179.0,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.20259954035282135,
      "epoch": 0.05798981009726725,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09602506458759308,
      "kl": 0.0007747110503260046,
      "learning_rate": 9.884113015284854e-07,
      "loss": 0.0285,
      "num_tokens": 34565969.0,
      "reward": 0.9419240951538086,
      "reward_std": 0.018595725297927856,
      "rewards/reward_func/mean": 0.9419240951538086,
      "rewards/reward_func/std": 0.018595723435282707,
      "step": 1252,
      "step_time": 20.653414957225323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4027465134859085,
      "epoch": 0.058036127836961555,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010446161031723022,
      "kl": 0.0013756774715147913,
      "learning_rate": 9.884020379805465e-07,
      "loss": 0.0001,
      "num_tokens": 34614959.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1253,
      "step_time": 23.52247118204832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 163.9375,
      "completions/mean_terminated_length": 163.9375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.22748686373233795,
      "epoch": 0.05808244557665586,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07628460973501205,
      "kl": 0.0014305194199550897,
      "learning_rate": 9.883927744326077e-07,
      "loss": -0.0661,
      "num_tokens": 34636654.0,
      "reward": 0.9226803779602051,
      "reward_std": 0.03836125135421753,
      "rewards/reward_func/mean": 0.9226803779602051,
      "rewards/reward_func/std": 0.038361258804798126,
      "step": 1254,
      "step_time": 18.081256940960884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 138.5625,
      "completions/mean_terminated_length": 138.5625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3049306273460388,
      "epoch": 0.05812876331635016,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011182550806552172,
      "kl": 0.001401193527271971,
      "learning_rate": 9.883835108846688e-07,
      "loss": 0.0001,
      "num_tokens": 34658519.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1255,
      "step_time": 15.809340998530388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 345.0,
      "completions/max_terminated_length": 345.0,
      "completions/mean_length": 272.625,
      "completions/mean_terminated_length": 272.625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.2618870660662651,
      "epoch": 0.058175081056044464,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07970226556062698,
      "kl": 0.0016272406501229852,
      "learning_rate": 9.8837424733673e-07,
      "loss": -0.1783,
      "num_tokens": 34695281.0,
      "reward": 0.5085287094116211,
      "reward_std": 0.42029711604118347,
      "rewards/reward_func/mean": 0.5085287094116211,
      "rewards/reward_func/std": 0.42029711604118347,
      "step": 1256,
      "step_time": 31.63137638568878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 142.0625,
      "completions/mean_terminated_length": 142.0625,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.3319794610142708,
      "epoch": 0.05822139879573877,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012234391178935766,
      "kl": 0.001421384746208787,
      "learning_rate": 9.88364983788791e-07,
      "loss": 0.0001,
      "num_tokens": 34725746.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1257,
      "step_time": 17.646457955241203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 128.75,
      "completions/mean_terminated_length": 128.75,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2638702392578125,
      "epoch": 0.05826771653543307,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016106648836284876,
      "kl": 0.0017507765733171254,
      "learning_rate": 9.883557202408522e-07,
      "loss": 0.0001,
      "num_tokens": 34761598.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1258,
      "step_time": 17.709375075995922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 114.875,
      "completions/mean_terminated_length": 114.875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.26288048177957535,
      "epoch": 0.05831403427512737,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014916558284312487,
      "kl": 0.0014745851221960038,
      "learning_rate": 9.883464566929135e-07,
      "loss": 0.0001,
      "num_tokens": 34781132.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1259,
      "step_time": 13.982169389724731
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 192.5625,
      "completions/mean_terminated_length": 192.5625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.4385821372270584,
      "epoch": 0.058360352014821676,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002485638251528144,
      "kl": 0.0021656985045410693,
      "learning_rate": 9.883371931449744e-07,
      "loss": 0.0001,
      "num_tokens": 34807061.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1260,
      "step_time": 20.581169545650482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 163.9375,
      "completions/mean_terminated_length": 163.9375,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.21002722531557083,
      "epoch": 0.05840666975451598,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1248169094324112,
      "kl": 0.001338052112259902,
      "learning_rate": 9.883279295970355e-07,
      "loss": -0.1039,
      "num_tokens": 34830820.0,
      "reward": 0.2829582095146179,
      "reward_std": 0.2080743908882141,
      "rewards/reward_func/mean": 0.2829582095146179,
      "rewards/reward_func/std": 0.2080743908882141,
      "step": 1261,
      "step_time": 19.391571924090385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 181.5625,
      "completions/mean_terminated_length": 181.5625,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.40674079954624176,
      "epoch": 0.05845298749421028,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011472152546048164,
      "kl": 0.0013039868790656328,
      "learning_rate": 9.883186660490967e-07,
      "loss": 0.0001,
      "num_tokens": 34871181.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1262,
      "step_time": 24.397856388241053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 193.5625,
      "completions/mean_terminated_length": 193.5625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.24332686886191368,
      "epoch": 0.058499305233904585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013939500786364079,
      "kl": 0.0011751919810194522,
      "learning_rate": 9.88309402501158e-07,
      "loss": 0.0001,
      "num_tokens": 34893286.0,
      "reward": 0.7295533418655396,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7295533418655396,
      "rewards/reward_func/std": 0.0,
      "step": 1263,
      "step_time": 20.54952061548829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 128.375,
      "completions/mean_terminated_length": 128.375,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.29128848016262054,
      "epoch": 0.05854562297359889,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011062759440392256,
      "kl": 0.0012307801225688308,
      "learning_rate": 9.883001389532191e-07,
      "loss": 0.0001,
      "num_tokens": 34914300.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1264,
      "step_time": 15.922392208129168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 228.375,
      "completions/mean_terminated_length": 228.375,
      "completions/min_length": 217.0,
      "completions/min_terminated_length": 217.0,
      "entropy": 0.15263668820261955,
      "epoch": 0.05859194071329319,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011531964410096407,
      "kl": 0.0007745838302071206,
      "learning_rate": 9.882908754052802e-07,
      "loss": 0.0,
      "num_tokens": 34939058.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1265,
      "step_time": 22.780731935054064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 189.1875,
      "completions/mean_terminated_length": 189.1875,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.21471868455410004,
      "epoch": 0.05863825845298749,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006897761486470699,
      "kl": 0.0007938119524624199,
      "learning_rate": 9.882816118573414e-07,
      "loss": 0.0,
      "num_tokens": 34981989.0,
      "reward": 0.11362193524837494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11362193524837494,
      "rewards/reward_func/std": 0.0,
      "step": 1266,
      "step_time": 24.1014525257051
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 168.125,
      "completions/mean_terminated_length": 168.125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.2848397567868233,
      "epoch": 0.058684576192681796,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11768696457147598,
      "kl": 0.0012865164317190647,
      "learning_rate": 9.882723483094025e-07,
      "loss": -0.0419,
      "num_tokens": 35004679.0,
      "reward": 0.2808825373649597,
      "reward_std": 0.018500691279768944,
      "rewards/reward_func/mean": 0.2808825373649597,
      "rewards/reward_func/std": 0.018500693142414093,
      "step": 1267,
      "step_time": 19.55908903107047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 127.0,
      "completions/mean_terminated_length": 127.0,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2190609648823738,
      "epoch": 0.0587308939323761,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008628237410448492,
      "kl": 0.0010214103240286931,
      "learning_rate": 9.882630847614636e-07,
      "loss": 0.0001,
      "num_tokens": 35024327.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1268,
      "step_time": 13.416114680469036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 361.0,
      "completions/max_terminated_length": 361.0,
      "completions/mean_length": 256.4375,
      "completions/mean_terminated_length": 256.4375,
      "completions/min_length": 212.0,
      "completions/min_terminated_length": 212.0,
      "entropy": 0.1592147834599018,
      "epoch": 0.0587772116720704,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.057634301483631134,
      "kl": 0.001346321965684183,
      "learning_rate": 9.882538212135247e-07,
      "loss": 0.0037,
      "num_tokens": 35056590.0,
      "reward": 0.8999221324920654,
      "reward_std": 0.16143766045570374,
      "rewards/reward_func/mean": 0.8999221324920654,
      "rewards/reward_func/std": 0.16143766045570374,
      "step": 1269,
      "step_time": 30.925078090280294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 206.875,
      "completions/mean_terminated_length": 206.875,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.32576390355825424,
      "epoch": 0.058823529411764705,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07088025659322739,
      "kl": 0.0012108280207030475,
      "learning_rate": 9.882445576655859e-07,
      "loss": -0.0892,
      "num_tokens": 35078380.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 1270,
      "step_time": 21.85751686245203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 117.6875,
      "completions/mean_terminated_length": 117.6875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.28740938007831573,
      "epoch": 0.05886984715145901,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008541119168512523,
      "kl": 0.0011367111001163721,
      "learning_rate": 9.88235294117647e-07,
      "loss": 0.0001,
      "num_tokens": 35101287.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1271,
      "step_time": 13.903431259095669
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 125.75,
      "completions/mean_terminated_length": 125.75,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.322317399084568,
      "epoch": 0.05891616489115331,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003119006287306547,
      "kl": 0.0015256724145729095,
      "learning_rate": 9.882260305697081e-07,
      "loss": 0.0001,
      "num_tokens": 35121555.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1272,
      "step_time": 13.342403680086136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 165.25,
      "completions/mean_terminated_length": 165.25,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.37049461901187897,
      "epoch": 0.058962482630847614,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012520358432084322,
      "kl": 0.0014613436069339514,
      "learning_rate": 9.882167670217692e-07,
      "loss": 0.0001,
      "num_tokens": 35152551.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1273,
      "step_time": 19.7287641428411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 169.375,
      "completions/mean_terminated_length": 169.375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3411256819963455,
      "epoch": 0.05900880037054192,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10679585486650467,
      "kl": 0.001722003478789702,
      "learning_rate": 9.882075034738304e-07,
      "loss": -0.0156,
      "num_tokens": 35174957.0,
      "reward": 0.0625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.0625,
      "rewards/reward_func/std": 0.25,
      "step": 1274,
      "step_time": 17.63904182612896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 178.0,
      "completions/mean_terminated_length": 178.0,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.23396510630846024,
      "epoch": 0.05905511811023622,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011064562713727355,
      "kl": 0.0009488572250120342,
      "learning_rate": 9.881982399258915e-07,
      "loss": 0.0,
      "num_tokens": 35195965.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1275,
      "step_time": 17.236826792359352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 147.875,
      "completions/mean_terminated_length": 147.875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.205683171749115,
      "epoch": 0.05910143584993052,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011295033618807793,
      "kl": 0.001086074553313665,
      "learning_rate": 9.881889763779528e-07,
      "loss": 0.0001,
      "num_tokens": 35227211.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 1276,
      "step_time": 18.04539056122303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 129.0,
      "completions/max_terminated_length": 129.0,
      "completions/mean_length": 118.1875,
      "completions/mean_terminated_length": 118.1875,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.26655275374650955,
      "epoch": 0.059147753589624825,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017749374965205789,
      "kl": 0.0015684491663705558,
      "learning_rate": 9.88179712830014e-07,
      "loss": 0.0001,
      "num_tokens": 35250622.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1277,
      "step_time": 13.596657756716013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 154.3125,
      "completions/mean_terminated_length": 154.3125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3734418898820877,
      "epoch": 0.05919407132931913,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002765932120382786,
      "kl": 0.002765625366009772,
      "learning_rate": 9.881704492820749e-07,
      "loss": 0.0001,
      "num_tokens": 35274563.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1278,
      "step_time": 17.611996166408062
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 147.8125,
      "completions/mean_terminated_length": 147.8125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3976045846939087,
      "epoch": 0.05924038906901343,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010244401637464762,
      "kl": 0.0012972986733075231,
      "learning_rate": 9.88161185734136e-07,
      "loss": 0.0001,
      "num_tokens": 35309248.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1279,
      "step_time": 19.944551046937704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 132.25,
      "completions/mean_terminated_length": 132.25,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.3238281235098839,
      "epoch": 0.059286706808707734,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025384421460330486,
      "kl": 0.0015252811426762491,
      "learning_rate": 9.881519221861973e-07,
      "loss": 0.0001,
      "num_tokens": 35333140.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1280,
      "step_time": 16.57121830061078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 224.75,
      "completions/mean_terminated_length": 224.75,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "entropy": 0.14412657544016838,
      "epoch": 0.05933302454840204,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005944859585724771,
      "kl": 0.0006109927489887923,
      "learning_rate": 9.881426586382584e-07,
      "loss": 0.0,
      "num_tokens": 35363456.0,
      "reward": 0.9622687101364136,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9622687101364136,
      "rewards/reward_func/std": 0.0,
      "step": 1281,
      "step_time": 23.807335074990988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 295.0,
      "completions/max_terminated_length": 295.0,
      "completions/mean_length": 253.875,
      "completions/mean_terminated_length": 253.875,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "entropy": 0.18086369708180428,
      "epoch": 0.05937934228809634,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0869535431265831,
      "kl": 0.0009457567939534783,
      "learning_rate": 9.881333950903196e-07,
      "loss": -0.0278,
      "num_tokens": 35387310.0,
      "reward": 0.9387601017951965,
      "reward_std": 0.048821549862623215,
      "rewards/reward_func/mean": 0.9387601017951965,
      "rewards/reward_func/std": 0.048821575939655304,
      "step": 1282,
      "step_time": 25.28862490877509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 231.6875,
      "completions/mean_terminated_length": 231.6875,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "entropy": 0.14501139521598816,
      "epoch": 0.05942566002779064,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00144374358933419,
      "kl": 0.000982460391242057,
      "learning_rate": 9.881241315423807e-07,
      "loss": 0.0,
      "num_tokens": 35411833.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1283,
      "step_time": 22.92037371918559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 238.5,
      "completions/mean_terminated_length": 238.5,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "entropy": 0.3042704053223133,
      "epoch": 0.059471977767484946,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06681200116872787,
      "kl": 0.001692799385637045,
      "learning_rate": 9.881148679944418e-07,
      "loss": -0.0392,
      "num_tokens": 35449601.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 1284,
      "step_time": 26.501106817275286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 126.4375,
      "completions/mean_terminated_length": 126.4375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2799791172146797,
      "epoch": 0.05951829550717925,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001297675189562142,
      "kl": 0.0012755271745845675,
      "learning_rate": 9.88105604446503e-07,
      "loss": 0.0001,
      "num_tokens": 35469704.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1285,
      "step_time": 15.396445531398058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 115.9375,
      "completions/mean_terminated_length": 115.9375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.23795905709266663,
      "epoch": 0.05956461324687355,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011905039427801967,
      "kl": 0.00119230174459517,
      "learning_rate": 9.88096340898564e-07,
      "loss": 0.0001,
      "num_tokens": 35489623.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1286,
      "step_time": 12.979618959128857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 136.625,
      "completions/mean_terminated_length": 136.625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.26678039133548737,
      "epoch": 0.059610930986567855,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001481172046624124,
      "kl": 0.001021220610709861,
      "learning_rate": 9.880870773506252e-07,
      "loss": 0.0001,
      "num_tokens": 35509633.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1287,
      "step_time": 15.040730103850365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 130.8125,
      "completions/mean_terminated_length": 130.8125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.21279476955533028,
      "epoch": 0.05965724872626216,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001137681887485087,
      "kl": 0.0009014675742946565,
      "learning_rate": 9.880778138026863e-07,
      "loss": 0.0,
      "num_tokens": 35531342.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1288,
      "step_time": 14.034538641571999
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 126.75,
      "completions/mean_terminated_length": 126.75,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2391769178211689,
      "epoch": 0.05970356646595646,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014369251439347863,
      "kl": 0.0011704412463586777,
      "learning_rate": 9.880685502547477e-07,
      "loss": 0.0001,
      "num_tokens": 35550986.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1289,
      "step_time": 13.299801394343376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 214.625,
      "completions/mean_terminated_length": 214.625,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.1921832300722599,
      "epoch": 0.05974988420565076,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008338350453414023,
      "kl": 0.0008639673033030704,
      "learning_rate": 9.880592867068088e-07,
      "loss": 0.0,
      "num_tokens": 35576340.0,
      "reward": 0.7221074104309082,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7221074104309082,
      "rewards/reward_func/std": 0.0,
      "step": 1290,
      "step_time": 21.38058177381754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 124.1875,
      "completions/mean_terminated_length": 124.1875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.25062207877635956,
      "epoch": 0.059796201945345066,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013031045673415065,
      "kl": 0.0013300326390890405,
      "learning_rate": 9.880500231588697e-07,
      "loss": 0.0001,
      "num_tokens": 35596167.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1291,
      "step_time": 14.002082046121359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 158.5,
      "completions/mean_terminated_length": 158.5,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.3573547676205635,
      "epoch": 0.05984251968503937,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002497861860319972,
      "kl": 0.0017638968129176646,
      "learning_rate": 9.880407596109308e-07,
      "loss": 0.0001,
      "num_tokens": 35616959.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1292,
      "step_time": 18.8273093290627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 190.5625,
      "completions/mean_terminated_length": 190.5625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.4123840406537056,
      "epoch": 0.05988883742473367,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001903548021800816,
      "kl": 0.001797056320356205,
      "learning_rate": 9.880314960629922e-07,
      "loss": 0.0001,
      "num_tokens": 35654136.0,
      "reward": 0.780767560005188,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.780767560005188,
      "rewards/reward_func/std": 0.0,
      "step": 1293,
      "step_time": 24.41575490310788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 131.375,
      "completions/mean_terminated_length": 131.375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2838471829891205,
      "epoch": 0.059935155164427975,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008910238975659013,
      "kl": 0.0011368074774509296,
      "learning_rate": 9.880222325150533e-07,
      "loss": 0.0001,
      "num_tokens": 35676110.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1294,
      "step_time": 14.732032056897879
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 263.0,
      "completions/max_terminated_length": 263.0,
      "completions/mean_length": 206.1875,
      "completions/mean_terminated_length": 206.1875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.25893597304821014,
      "epoch": 0.05998147290412228,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07036730647087097,
      "kl": 0.0012522713805083185,
      "learning_rate": 9.880129689671144e-07,
      "loss": -0.0349,
      "num_tokens": 35698065.0,
      "reward": 0.8293574452400208,
      "reward_std": 0.301551878452301,
      "rewards/reward_func/mean": 0.8293574452400208,
      "rewards/reward_func/std": 0.3015519082546234,
      "step": 1295,
      "step_time": 22.613749779760838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 142.3125,
      "completions/mean_terminated_length": 142.3125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3176952674984932,
      "epoch": 0.06002779064381658,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009578858152963221,
      "kl": 0.0009685322729637846,
      "learning_rate": 9.880037054191755e-07,
      "loss": 0.0,
      "num_tokens": 35731030.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1296,
      "step_time": 19.185499880462885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 288.0,
      "completions/max_terminated_length": 288.0,
      "completions/mean_length": 210.625,
      "completions/mean_terminated_length": 210.625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.41428501158952713,
      "epoch": 0.060074108383510884,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001212472328916192,
      "kl": 0.0012661851360462606,
      "learning_rate": 9.879944418712367e-07,
      "loss": 0.0001,
      "num_tokens": 35768928.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1297,
      "step_time": 27.58428728580475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 196.1875,
      "completions/mean_terminated_length": 196.1875,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.18155839666724205,
      "epoch": 0.06012042612320519,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11216012388467789,
      "kl": 0.0015206912066787481,
      "learning_rate": 9.879851783232978e-07,
      "loss": 0.0224,
      "num_tokens": 35792579.0,
      "reward": 0.9409475922584534,
      "reward_std": 0.06039387732744217,
      "rewards/reward_func/mean": 0.9409475922584534,
      "rewards/reward_func/std": 0.06039387360215187,
      "step": 1298,
      "step_time": 21.5261762291193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 203.125,
      "completions/mean_terminated_length": 203.125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.25570859387516975,
      "epoch": 0.06016674386289949,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10892330855131149,
      "kl": 0.0020363611401990056,
      "learning_rate": 9.87975914775359e-07,
      "loss": -0.0393,
      "num_tokens": 35830133.0,
      "reward": 0.7691745162010193,
      "reward_std": 0.16072623431682587,
      "rewards/reward_func/mean": 0.7691745162010193,
      "rewards/reward_func/std": 0.16072624921798706,
      "step": 1299,
      "step_time": 23.691280510276556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 301.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 269.75,
      "completions/mean_terminated_length": 269.75,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "entropy": 0.22463082149624825,
      "epoch": 0.06021306160259379,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05471609905362129,
      "kl": 0.001213410694617778,
      "learning_rate": 9.8796665122742e-07,
      "loss": 0.0071,
      "num_tokens": 35859425.0,
      "reward": 0.98279869556427,
      "reward_std": 0.004587030503898859,
      "rewards/reward_func/mean": 0.98279869556427,
      "rewards/reward_func/std": 0.004587024915963411,
      "step": 1300,
      "step_time": 26.6886116117239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 163.375,
      "completions/mean_terminated_length": 163.375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3098113238811493,
      "epoch": 0.060259379342288096,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09868577122688293,
      "kl": 0.001806522865081206,
      "learning_rate": 9.879573876794812e-07,
      "loss": -0.0978,
      "num_tokens": 35880999.0,
      "reward": 0.19428689777851105,
      "reward_std": 0.3475509583950043,
      "rewards/reward_func/mean": 0.19428689777851105,
      "rewards/reward_func/std": 0.34755098819732666,
      "step": 1301,
      "step_time": 17.72952525690198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 141.8125,
      "completions/mean_terminated_length": 141.8125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3010546714067459,
      "epoch": 0.0603056970819824,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011427297722548246,
      "kl": 0.0011611956870183349,
      "learning_rate": 9.879481241315423e-07,
      "loss": 0.0001,
      "num_tokens": 35902020.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1302,
      "step_time": 15.329928517341614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 145.1875,
      "completions/mean_terminated_length": 145.1875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.3341658264398575,
      "epoch": 0.0603520148216767,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009805286535993218,
      "kl": 0.0010981812374666333,
      "learning_rate": 9.879388605836034e-07,
      "loss": 0.0001,
      "num_tokens": 35937991.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1303,
      "step_time": 19.20741555839777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 228.25,
      "completions/mean_terminated_length": 228.25,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "entropy": 0.2592865750193596,
      "epoch": 0.060398332561371004,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001082071801647544,
      "kl": 0.001120303408242762,
      "learning_rate": 9.879295970356645e-07,
      "loss": 0.0001,
      "num_tokens": 35963131.0,
      "reward": 0.8243306875228882,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8243306875228882,
      "rewards/reward_func/std": 0.0,
      "step": 1304,
      "step_time": 22.70706956088543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 166.9375,
      "completions/mean_terminated_length": 166.9375,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.19587522745132446,
      "epoch": 0.06044465030106531,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007689369958825409,
      "kl": 0.0009125641372520477,
      "learning_rate": 9.879203334877257e-07,
      "loss": 0.0,
      "num_tokens": 35997546.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 1305,
      "step_time": 19.82010806724429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 226.3125,
      "completions/mean_terminated_length": 226.3125,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.2953353226184845,
      "epoch": 0.06049096804075961,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001081167720258236,
      "kl": 0.0009990074031520635,
      "learning_rate": 9.87911069939787e-07,
      "loss": 0.0001,
      "num_tokens": 36024911.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1306,
      "step_time": 23.03389771655202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 160.4375,
      "completions/mean_terminated_length": 160.4375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.20224759727716446,
      "epoch": 0.06053728578045391,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1349053829908371,
      "kl": 0.001746593916323036,
      "learning_rate": 9.879018063918481e-07,
      "loss": -0.0521,
      "num_tokens": 36047238.0,
      "reward": 0.6257256269454956,
      "reward_std": 0.10125773400068283,
      "rewards/reward_func/mean": 0.6257256269454956,
      "rewards/reward_func/std": 0.10125772655010223,
      "step": 1307,
      "step_time": 17.689881186932325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 179.5625,
      "completions/mean_terminated_length": 179.5625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.39270056784152985,
      "epoch": 0.060583603520148216,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011465889401733875,
      "kl": 0.0012371263874229044,
      "learning_rate": 9.878925428439092e-07,
      "loss": 0.0001,
      "num_tokens": 36076975.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1308,
      "step_time": 20.26331490278244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 172.0,
      "completions/mean_terminated_length": 172.0,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.4041343480348587,
      "epoch": 0.06062992125984252,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011341717327013612,
      "kl": 0.001408853247994557,
      "learning_rate": 9.878832792959702e-07,
      "loss": 0.0001,
      "num_tokens": 36102255.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1309,
      "step_time": 20.550407517701387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 133.0625,
      "completions/mean_terminated_length": 133.0625,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.22065860033035278,
      "epoch": 0.06067623899953682,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000924820953514427,
      "kl": 0.0011635018163360655,
      "learning_rate": 9.878740157480315e-07,
      "loss": 0.0001,
      "num_tokens": 36122208.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1310,
      "step_time": 14.049900580197573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 225.8125,
      "completions/mean_terminated_length": 225.8125,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.25286129862070084,
      "epoch": 0.060722556739231125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001199951395392418,
      "kl": 0.0012331777979852632,
      "learning_rate": 9.878647522000926e-07,
      "loss": 0.0001,
      "num_tokens": 36160285.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1311,
      "step_time": 25.68111901730299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 129.1875,
      "completions/mean_terminated_length": 129.1875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.28861740976572037,
      "epoch": 0.06076887447892543,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007949213613756001,
      "kl": 0.0009198406187351793,
      "learning_rate": 9.878554886521537e-07,
      "loss": 0.0,
      "num_tokens": 36188192.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1312,
      "step_time": 15.98707052692771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 130.125,
      "completions/mean_terminated_length": 130.125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.32708143442869186,
      "epoch": 0.06081519221861973,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011910056928172708,
      "kl": 0.0015137761947698891,
      "learning_rate": 9.878462251042149e-07,
      "loss": 0.0001,
      "num_tokens": 36209762.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1313,
      "step_time": 15.773848168551922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 137.0625,
      "completions/mean_terminated_length": 137.0625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.30693046003580093,
      "epoch": 0.06086150995831403,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016096553299576044,
      "kl": 0.0012392305507091805,
      "learning_rate": 9.87836961556276e-07,
      "loss": 0.0001,
      "num_tokens": 36239667.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1314,
      "step_time": 17.582204215228558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 186.125,
      "completions/mean_terminated_length": 186.125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.44022227823734283,
      "epoch": 0.060907827698008336,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009159357869066298,
      "kl": 0.0014115108933765441,
      "learning_rate": 9.878276980083371e-07,
      "loss": 0.0001,
      "num_tokens": 36264693.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1315,
      "step_time": 20.20029328763485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 316.0,
      "completions/max_terminated_length": 316.0,
      "completions/mean_length": 231.0,
      "completions/mean_terminated_length": 231.0,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.40278957784175873,
      "epoch": 0.06095414543770264,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09605013579130173,
      "kl": 0.0016143825778272003,
      "learning_rate": 9.878184344603982e-07,
      "loss": -0.1135,
      "num_tokens": 36287765.0,
      "reward": 0.09924251586198807,
      "reward_std": 0.26821577548980713,
      "rewards/reward_func/mean": 0.09924251586198807,
      "rewards/reward_func/std": 0.2682158052921295,
      "step": 1316,
      "step_time": 27.463576547801495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 199.5,
      "completions/mean_terminated_length": 199.5,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.20220749825239182,
      "epoch": 0.06100046317739694,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005912294727750123,
      "kl": 0.0007472722791135311,
      "learning_rate": 9.878091709124594e-07,
      "loss": 0.0,
      "num_tokens": 36311837.0,
      "reward": 0.9534969329833984,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9534969329833984,
      "rewards/reward_func/std": 0.0,
      "step": 1317,
      "step_time": 24.03943707793951
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 171.9375,
      "completions/mean_terminated_length": 171.9375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.22347812727093697,
      "epoch": 0.061046780917091245,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08405770361423492,
      "kl": 0.0007646936719538644,
      "learning_rate": 9.877999073645205e-07,
      "loss": -0.0184,
      "num_tokens": 36340172.0,
      "reward": 0.8599967956542969,
      "reward_std": 0.08930132538080215,
      "rewards/reward_func/mean": 0.8599967956542969,
      "rewards/reward_func/std": 0.08930133283138275,
      "step": 1318,
      "step_time": 19.7347002774477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 173.875,
      "completions/mean_terminated_length": 173.875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.30847054719924927,
      "epoch": 0.06109309865678555,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024909113999456167,
      "kl": 0.0015048130590002984,
      "learning_rate": 9.877906438165818e-07,
      "loss": 0.0001,
      "num_tokens": 36360890.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1319,
      "step_time": 19.199145317077637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 131.0,
      "completions/mean_length": 117.1875,
      "completions/mean_terminated_length": 117.1875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.2440360188484192,
      "epoch": 0.06113941639647985,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012768898159265518,
      "kl": 0.0012907666387036443,
      "learning_rate": 9.87781380268643e-07,
      "loss": 0.0001,
      "num_tokens": 36384269.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1320,
      "step_time": 13.399408016353846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 126.5,
      "completions/mean_terminated_length": 126.5,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.3299025222659111,
      "epoch": 0.061185734136174154,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002699008211493492,
      "kl": 0.0015059859433677047,
      "learning_rate": 9.877721167207039e-07,
      "loss": 0.0001,
      "num_tokens": 36418437.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1321,
      "step_time": 18.462588392198086
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 267.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 228.0,
      "completions/mean_terminated_length": 228.0,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.25493139028549194,
      "epoch": 0.06123205187586846,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10637736320495605,
      "kl": 0.0016697085520718247,
      "learning_rate": 9.87762853172765e-07,
      "loss": -0.0492,
      "num_tokens": 36443637.0,
      "reward": 0.6062300205230713,
      "reward_std": 0.02733754739165306,
      "rewards/reward_func/mean": 0.6062300205230713,
      "rewards/reward_func/std": 0.027337554842233658,
      "step": 1322,
      "step_time": 24.628826271742582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 164.9375,
      "completions/mean_terminated_length": 164.9375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3845409229397774,
      "epoch": 0.06127836961556276,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013907892862334847,
      "kl": 0.001733310054987669,
      "learning_rate": 9.877535896248263e-07,
      "loss": 0.0001,
      "num_tokens": 36480532.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1323,
      "step_time": 20.58817472308874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 126.3125,
      "completions/mean_terminated_length": 126.3125,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.34343016892671585,
      "epoch": 0.06132468735525706,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020387242548167706,
      "kl": 0.0015359602111857384,
      "learning_rate": 9.877443260768875e-07,
      "loss": 0.0001,
      "num_tokens": 36501257.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1324,
      "step_time": 13.899054154753685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 168.1875,
      "completions/mean_terminated_length": 168.1875,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.329635426402092,
      "epoch": 0.061371005094951366,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009411834180355072,
      "kl": 0.0012850109487771988,
      "learning_rate": 9.877350625289486e-07,
      "loss": 0.0001,
      "num_tokens": 36523564.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1325,
      "step_time": 18.96903756260872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 173.625,
      "completions/mean_terminated_length": 173.625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.379032664000988,
      "epoch": 0.06141732283464567,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011547692120075226,
      "kl": 0.001265781669644639,
      "learning_rate": 9.877257989810097e-07,
      "loss": 0.0001,
      "num_tokens": 36546198.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1326,
      "step_time": 19.25311341881752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 116.125,
      "completions/mean_terminated_length": 116.125,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.30368292331695557,
      "epoch": 0.06146364057433997,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000730156316421926,
      "kl": 0.0010984012042172253,
      "learning_rate": 9.877165354330708e-07,
      "loss": 0.0001,
      "num_tokens": 36567672.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1327,
      "step_time": 13.52169605344534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 124.5,
      "completions/mean_terminated_length": 124.5,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2857901155948639,
      "epoch": 0.061509958314034274,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018799130339175463,
      "kl": 0.001398787455400452,
      "learning_rate": 9.87707271885132e-07,
      "loss": 0.0001,
      "num_tokens": 36588944.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1328,
      "step_time": 15.058954171836376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 145.5625,
      "completions/mean_terminated_length": 145.5625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.32882945239543915,
      "epoch": 0.06155627605372858,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016379902372136712,
      "kl": 0.0013487855030689389,
      "learning_rate": 9.87698008337193e-07,
      "loss": 0.0001,
      "num_tokens": 36622473.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1329,
      "step_time": 18.894803293049335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 177.4375,
      "completions/mean_terminated_length": 177.4375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4255825951695442,
      "epoch": 0.06160259379342288,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012570770923048258,
      "kl": 0.002042806096142158,
      "learning_rate": 9.876887447892542e-07,
      "loss": 0.0001,
      "num_tokens": 36673488.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1330,
      "step_time": 26.708086907863617
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 205.25,
      "completions/mean_terminated_length": 205.25,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.45978182554244995,
      "epoch": 0.06164891153311718,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09499292820692062,
      "kl": 0.0015033484087325633,
      "learning_rate": 9.876794812413153e-07,
      "loss": 0.0735,
      "num_tokens": 36695524.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 1331,
      "step_time": 23.335980210453272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 198.1875,
      "completions/mean_terminated_length": 198.1875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.2236262559890747,
      "epoch": 0.061695229272811486,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07165606319904327,
      "kl": 0.001192555035231635,
      "learning_rate": 9.876702176933765e-07,
      "loss": -0.0652,
      "num_tokens": 36718039.0,
      "reward": 0.4348215162754059,
      "reward_std": 0.18556353449821472,
      "rewards/reward_func/mean": 0.4348215162754059,
      "rewards/reward_func/std": 0.18556353449821472,
      "step": 1332,
      "step_time": 21.416344843804836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 133.25,
      "completions/mean_terminated_length": 133.25,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.298039510846138,
      "epoch": 0.06174154701250579,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001903527183458209,
      "kl": 0.0014055129431653768,
      "learning_rate": 9.876609541454378e-07,
      "loss": 0.0001,
      "num_tokens": 36739675.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1333,
      "step_time": 15.189780503511429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 181.3125,
      "completions/mean_terminated_length": 181.3125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.29572784155607224,
      "epoch": 0.06178786475220009,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08701220899820328,
      "kl": 0.0014234512636903673,
      "learning_rate": 9.876516905974987e-07,
      "loss": 0.0053,
      "num_tokens": 36763840.0,
      "reward": 0.009960266761481762,
      "reward_std": 0.009204850532114506,
      "rewards/reward_func/mean": 0.009960266761481762,
      "rewards/reward_func/std": 0.00920485146343708,
      "step": 1334,
      "step_time": 21.215655487030745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 178.75,
      "completions/mean_terminated_length": 178.75,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.20869983360171318,
      "epoch": 0.061834182491894395,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06351931393146515,
      "kl": 0.0010749039647635072,
      "learning_rate": 9.876424270495598e-07,
      "loss": -0.0059,
      "num_tokens": 36801148.0,
      "reward": 0.9219220876693726,
      "reward_std": 0.02082076109945774,
      "rewards/reward_func/mean": 0.9219220876693726,
      "rewards/reward_func/std": 0.02082076668739319,
      "step": 1335,
      "step_time": 21.6966012082994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 125.1875,
      "completions/mean_terminated_length": 125.1875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3079006224870682,
      "epoch": 0.0618805002315887,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007926784455776215,
      "kl": 0.0009599182958481833,
      "learning_rate": 9.876331635016212e-07,
      "loss": 0.0,
      "num_tokens": 36822591.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1336,
      "step_time": 14.76633208990097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 224.1875,
      "completions/mean_terminated_length": 224.1875,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.2204713374376297,
      "epoch": 0.061926817971283,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004801633767783642,
      "kl": 0.000696543458616361,
      "learning_rate": 9.876238999536823e-07,
      "loss": 0.0,
      "num_tokens": 36854370.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 1337,
      "step_time": 24.375063110142946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 308.0,
      "completions/max_terminated_length": 308.0,
      "completions/mean_length": 235.75,
      "completions/mean_terminated_length": 235.75,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.4921463578939438,
      "epoch": 0.0619731357109773,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07507622241973877,
      "kl": 0.0015840266714803874,
      "learning_rate": 9.876146364057434e-07,
      "loss": 0.1382,
      "num_tokens": 36886046.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 1338,
      "step_time": 28.953611817210913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 159.375,
      "completions/mean_terminated_length": 159.375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.2921140268445015,
      "epoch": 0.062019453450671606,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016031945124268532,
      "kl": 0.0011878642835654318,
      "learning_rate": 9.876053728578045e-07,
      "loss": 0.0001,
      "num_tokens": 36909492.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1339,
      "step_time": 17.372348058968782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 133.5,
      "completions/mean_terminated_length": 133.5,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.31634822487831116,
      "epoch": 0.06206577119036591,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011940133990719914,
      "kl": 0.001183247019071132,
      "learning_rate": 9.875961093098657e-07,
      "loss": 0.0001,
      "num_tokens": 36931660.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1340,
      "step_time": 14.516185022890568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 115.0,
      "completions/mean_terminated_length": 115.0,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2151433601975441,
      "epoch": 0.06211208893006021,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002757185371592641,
      "kl": 0.0011798940249718726,
      "learning_rate": 9.875868457619268e-07,
      "loss": 0.0001,
      "num_tokens": 36951132.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1341,
      "step_time": 13.34553236886859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 342.0,
      "completions/max_terminated_length": 342.0,
      "completions/mean_length": 287.5625,
      "completions/mean_terminated_length": 287.5625,
      "completions/min_length": 260.0,
      "completions/min_terminated_length": 260.0,
      "entropy": 0.3032371699810028,
      "epoch": 0.062158406669754515,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05955071002244949,
      "kl": 0.0010868890822166577,
      "learning_rate": 9.87577582213988e-07,
      "loss": -0.0221,
      "num_tokens": 36990805.0,
      "reward": 0.43345510959625244,
      "reward_std": 0.11558802425861359,
      "rewards/reward_func/mean": 0.43345510959625244,
      "rewards/reward_func/std": 0.11558802425861359,
      "step": 1342,
      "step_time": 32.559711404144764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 196.8125,
      "completions/mean_terminated_length": 196.8125,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.22023657709360123,
      "epoch": 0.06220472440944882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06820940971374512,
      "kl": 0.0012514598201960325,
      "learning_rate": 9.87568318666049e-07,
      "loss": -0.0263,
      "num_tokens": 37015586.0,
      "reward": 0.9275899529457092,
      "reward_std": 0.035925447940826416,
      "rewards/reward_func/mean": 0.9275899529457092,
      "rewards/reward_func/std": 0.03592545539140701,
      "step": 1343,
      "step_time": 20.035180181264877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 205.375,
      "completions/mean_terminated_length": 205.375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.1878780871629715,
      "epoch": 0.06225104214914312,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06910555064678192,
      "kl": 0.0008483437850372866,
      "learning_rate": 9.875590551181102e-07,
      "loss": 0.0081,
      "num_tokens": 37041144.0,
      "reward": 0.9754081964492798,
      "reward_std": 0.09836733341217041,
      "rewards/reward_func/mean": 0.9754081964492798,
      "rewards/reward_func/std": 0.09836734086275101,
      "step": 1344,
      "step_time": 20.38943938910961
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 178.25,
      "completions/mean_terminated_length": 178.25,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.22444728761911392,
      "epoch": 0.062297359888837424,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001037672394886613,
      "kl": 0.0008089821494650096,
      "learning_rate": 9.875497915701713e-07,
      "loss": 0.0,
      "num_tokens": 37065548.0,
      "reward": 0.9487294554710388,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9487294554710388,
      "rewards/reward_func/std": 0.0,
      "step": 1345,
      "step_time": 18.363298401236534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 112.5625,
      "completions/mean_terminated_length": 112.5625,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.297483429312706,
      "epoch": 0.06234367762853173,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014063261914998293,
      "kl": 0.0014439236838370562,
      "learning_rate": 9.875405280222324e-07,
      "loss": 0.0001,
      "num_tokens": 37086277.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1346,
      "step_time": 12.818291902542114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 169.5625,
      "completions/mean_terminated_length": 169.5625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.20191585645079613,
      "epoch": 0.06238999536822603,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004159863572567701,
      "kl": 0.0018049118225462735,
      "learning_rate": 9.875312644742935e-07,
      "loss": 0.0001,
      "num_tokens": 37108366.0,
      "reward": 0.8205257058143616,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8205257058143616,
      "rewards/reward_func/std": 0.0,
      "step": 1347,
      "step_time": 17.730277586728334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 180.6875,
      "completions/mean_terminated_length": 180.6875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.21539785712957382,
      "epoch": 0.06243631310792033,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09756869822740555,
      "kl": 0.0016226680018007755,
      "learning_rate": 9.875220009263547e-07,
      "loss": 0.0179,
      "num_tokens": 37131433.0,
      "reward": 0.9275797009468079,
      "reward_std": 0.04178958758711815,
      "rewards/reward_func/mean": 0.9275797009468079,
      "rewards/reward_func/std": 0.041789598762989044,
      "step": 1348,
      "step_time": 18.883345417678356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 387.0,
      "completions/max_terminated_length": 387.0,
      "completions/mean_length": 275.0625,
      "completions/mean_terminated_length": 275.0625,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "entropy": 0.31261318176984787,
      "epoch": 0.062482630847614636,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06316092610359192,
      "kl": 0.0017653298273216933,
      "learning_rate": 9.875127373784158e-07,
      "loss": -0.1469,
      "num_tokens": 37169402.0,
      "reward": 0.4761143624782562,
      "reward_std": 0.3884202241897583,
      "rewards/reward_func/mean": 0.4761143624782562,
      "rewards/reward_func/std": 0.3884202241897583,
      "step": 1349,
      "step_time": 36.37331370264292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 138.9375,
      "completions/mean_terminated_length": 138.9375,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3901263177394867,
      "epoch": 0.06252894858730894,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005902454722672701,
      "kl": 0.002515608735848218,
      "learning_rate": 9.875034738304771e-07,
      "loss": 0.0001,
      "num_tokens": 37191545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1350,
      "step_time": 15.089044328778982
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 217.125,
      "completions/mean_terminated_length": 217.125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.2958965077996254,
      "epoch": 0.06257526632700325,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08561558276414871,
      "kl": 0.0015522035246249288,
      "learning_rate": 9.874942102825383e-07,
      "loss": -0.0959,
      "num_tokens": 37213915.0,
      "reward": 0.40241771936416626,
      "reward_std": 0.3842158317565918,
      "rewards/reward_func/mean": 0.40241771936416626,
      "rewards/reward_func/std": 0.3842158317565918,
      "step": 1351,
      "step_time": 22.39298490062356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 160.75,
      "completions/mean_terminated_length": 160.75,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.3948901891708374,
      "epoch": 0.06262158406669754,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008058904786594212,
      "kl": 0.0014029656304046512,
      "learning_rate": 9.874849467345992e-07,
      "loss": 0.0001,
      "num_tokens": 37248631.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1352,
      "step_time": 20.79153921827674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 149.5,
      "completions/mean_terminated_length": 149.5,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.2936493381857872,
      "epoch": 0.06266790180639185,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019970559515058994,
      "kl": 0.0013638751115649939,
      "learning_rate": 9.874756831866605e-07,
      "loss": 0.0001,
      "num_tokens": 37270063.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1353,
      "step_time": 16.7253421805799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 135.1875,
      "completions/mean_terminated_length": 135.1875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.3047530725598335,
      "epoch": 0.06271421954608615,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002005633432418108,
      "kl": 0.0013498670450644568,
      "learning_rate": 9.874664196387216e-07,
      "loss": 0.0001,
      "num_tokens": 37297634.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1354,
      "step_time": 16.681302469223738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 145.625,
      "completions/mean_terminated_length": 145.625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.3298976346850395,
      "epoch": 0.06276053728578046,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014601984294131398,
      "kl": 0.0017668300424702466,
      "learning_rate": 9.874571560907828e-07,
      "loss": 0.0001,
      "num_tokens": 37328252.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1355,
      "step_time": 17.33435459434986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 176.5,
      "completions/mean_terminated_length": 176.5,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.14845091477036476,
      "epoch": 0.06280685502547476,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007003385107964277,
      "kl": 0.0007584914710605517,
      "learning_rate": 9.874478925428439e-07,
      "loss": 0.0,
      "num_tokens": 37351588.0,
      "reward": 0.92438805103302,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.92438805103302,
      "rewards/reward_func/std": 0.0,
      "step": 1356,
      "step_time": 18.006115213036537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 138.1875,
      "completions/mean_terminated_length": 138.1875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.33333951979875565,
      "epoch": 0.06285317276516907,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004086427856236696,
      "kl": 0.001981131761567667,
      "learning_rate": 9.87438628994905e-07,
      "loss": 0.0001,
      "num_tokens": 37387639.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1357,
      "step_time": 18.09918538853526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 129.375,
      "completions/mean_terminated_length": 129.375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.29088229686021805,
      "epoch": 0.06289949050486336,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008748992113396525,
      "kl": 0.0010925929382210597,
      "learning_rate": 9.874293654469661e-07,
      "loss": 0.0001,
      "num_tokens": 37413933.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1358,
      "step_time": 15.646902363747358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 278.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 198.1875,
      "completions/mean_terminated_length": 198.1875,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.37544991075992584,
      "epoch": 0.06294580824455767,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0776798352599144,
      "kl": 0.0014099810214247555,
      "learning_rate": 9.874201018990273e-07,
      "loss": -0.0494,
      "num_tokens": 37435520.0,
      "reward": 0.11230379343032837,
      "reward_std": 0.30910545587539673,
      "rewards/reward_func/mean": 0.11230379343032837,
      "rewards/reward_func/std": 0.3091054856777191,
      "step": 1359,
      "step_time": 23.2933401837945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 164.125,
      "completions/mean_terminated_length": 164.125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.22312795370817184,
      "epoch": 0.06299212598425197,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12842656672000885,
      "kl": 0.0025809964863583446,
      "learning_rate": 9.874108383510884e-07,
      "loss": -0.1071,
      "num_tokens": 37465122.0,
      "reward": 0.518261730670929,
      "reward_std": 0.41461795568466187,
      "rewards/reward_func/mean": 0.518261730670929,
      "rewards/reward_func/std": 0.41461798548698425,
      "step": 1360,
      "step_time": 20.39900228381157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 123.625,
      "completions/mean_terminated_length": 123.625,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.28706835955381393,
      "epoch": 0.06303844372394628,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009399523260071874,
      "kl": 0.0009960871102521196,
      "learning_rate": 9.874015748031495e-07,
      "loss": 0.0,
      "num_tokens": 37487004.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1361,
      "step_time": 14.537100818008184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 165.4375,
      "completions/mean_terminated_length": 165.4375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.4404766261577606,
      "epoch": 0.06308476146364057,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004660594742745161,
      "kl": 0.002517607470508665,
      "learning_rate": 9.873923112552106e-07,
      "loss": 0.0001,
      "num_tokens": 37516483.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1362,
      "step_time": 20.90831706300378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 116.3125,
      "completions/mean_terminated_length": 116.3125,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.2569211646914482,
      "epoch": 0.06313107920333488,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003202579217031598,
      "kl": 0.0018702812667470425,
      "learning_rate": 9.87383047707272e-07,
      "loss": 0.0001,
      "num_tokens": 37536504.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1363,
      "step_time": 13.529532633721828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 213.5625,
      "completions/mean_terminated_length": 213.5625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.2348765693604946,
      "epoch": 0.06317739694302918,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009015243267640471,
      "kl": 0.0009753518679644912,
      "learning_rate": 9.87373784159333e-07,
      "loss": 0.0,
      "num_tokens": 37564641.0,
      "reward": 0.8510449528694153,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8510449528694153,
      "rewards/reward_func/std": 0.0,
      "step": 1364,
      "step_time": 23.56071775779128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 200.1875,
      "completions/mean_terminated_length": 200.1875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.4207221120595932,
      "epoch": 0.06322371468272349,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010644063586369157,
      "kl": 0.0013695932284463197,
      "learning_rate": 9.87364520611394e-07,
      "loss": 0.0001,
      "num_tokens": 37588404.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1365,
      "step_time": 22.19744211435318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 195.5,
      "completions/mean_terminated_length": 195.5,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.3812335655093193,
      "epoch": 0.06327003242241779,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017430639127269387,
      "kl": 0.0016534636088181287,
      "learning_rate": 9.873552570634553e-07,
      "loss": 0.0001,
      "num_tokens": 37619708.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1366,
      "step_time": 22.575360488146544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 201.1875,
      "completions/mean_terminated_length": 201.1875,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.39354346692562103,
      "epoch": 0.0633163501621121,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020605064928531647,
      "kl": 0.0016023786447476596,
      "learning_rate": 9.873459935155165e-07,
      "loss": 0.0001,
      "num_tokens": 37653567.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1367,
      "step_time": 23.613397791981697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 145.4375,
      "completions/mean_terminated_length": 145.4375,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.22626493498682976,
      "epoch": 0.06336266790180639,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026905538979917765,
      "kl": 0.0013218133826740086,
      "learning_rate": 9.873367299675776e-07,
      "loss": 0.0001,
      "num_tokens": 37673878.0,
      "reward": 0.8559471368789673,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8559471368789673,
      "rewards/reward_func/std": 0.0,
      "step": 1368,
      "step_time": 15.370013508945704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 156.9375,
      "completions/mean_terminated_length": 156.9375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3830214664340019,
      "epoch": 0.0634089856415007,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009626580867916346,
      "kl": 0.0013536399346776307,
      "learning_rate": 9.873274664196387e-07,
      "loss": 0.0001,
      "num_tokens": 37716229.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1369,
      "step_time": 21.070292565971613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 260.1875,
      "completions/mean_terminated_length": 260.1875,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "entropy": 0.1670360341668129,
      "epoch": 0.063455303381195,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016005614306777716,
      "kl": 0.0009976603032555431,
      "learning_rate": 9.873182028716998e-07,
      "loss": 0.0,
      "num_tokens": 37755416.0,
      "reward": 0.9677302837371826,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9677302837371826,
      "rewards/reward_func/std": 0.0,
      "step": 1370,
      "step_time": 26.825051859021187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 182.625,
      "completions/mean_terminated_length": 182.625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.17469500377774239,
      "epoch": 0.0635016211208893,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0870797410607338,
      "kl": 0.0008207607170334086,
      "learning_rate": 9.87308939323761e-07,
      "loss": 0.008,
      "num_tokens": 37778546.0,
      "reward": 0.33064115047454834,
      "reward_std": 0.1531805396080017,
      "rewards/reward_func/mean": 0.33064115047454834,
      "rewards/reward_func/std": 0.1531805396080017,
      "step": 1371,
      "step_time": 19.08588433265686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 129.0625,
      "completions/mean_terminated_length": 129.0625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3086008355021477,
      "epoch": 0.0635479388605836,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017839764477685094,
      "kl": 0.001566823193570599,
      "learning_rate": 9.87299675775822e-07,
      "loss": 0.0001,
      "num_tokens": 37799539.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1372,
      "step_time": 13.565544940531254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 181.8125,
      "completions/mean_terminated_length": 181.8125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.13856897875666618,
      "epoch": 0.06359425660027791,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004261926806066185,
      "kl": 0.0005177938946872018,
      "learning_rate": 9.872904122278832e-07,
      "loss": 0.0,
      "num_tokens": 37832592.0,
      "reward": 0.9167169332504272,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9167169332504272,
      "rewards/reward_func/std": 0.0,
      "step": 1373,
      "step_time": 19.823412846773863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 121.5,
      "completions/mean_terminated_length": 121.5,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.24950135871767998,
      "epoch": 0.06364057433997221,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011088309111073613,
      "kl": 0.001101298417779617,
      "learning_rate": 9.872811486799443e-07,
      "loss": 0.0001,
      "num_tokens": 37852216.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1374,
      "step_time": 12.886257383972406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 319.0,
      "completions/max_terminated_length": 319.0,
      "completions/mean_length": 257.4375,
      "completions/mean_terminated_length": 257.4375,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "entropy": 0.35273078829050064,
      "epoch": 0.06368689207966652,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.061511896550655365,
      "kl": 0.0013419757597148418,
      "learning_rate": 9.872718851320055e-07,
      "loss": -0.0959,
      "num_tokens": 37887695.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 1375,
      "step_time": 29.076378416270018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 132.6875,
      "completions/mean_terminated_length": 132.6875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2635773569345474,
      "epoch": 0.06373320981936081,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016760352300480008,
      "kl": 0.001321380288572982,
      "learning_rate": 9.872626215840668e-07,
      "loss": 0.0001,
      "num_tokens": 37912490.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1376,
      "step_time": 15.871300362050533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 107.4375,
      "completions/mean_terminated_length": 107.4375,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "entropy": 0.28921304643154144,
      "epoch": 0.06377952755905512,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010980983497574925,
      "kl": 0.0013212210615165532,
      "learning_rate": 9.872533580361277e-07,
      "loss": 0.0001,
      "num_tokens": 37935649.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1377,
      "step_time": 13.701858673244715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 184.0,
      "completions/mean_terminated_length": 184.0,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3393339291214943,
      "epoch": 0.06382584529874942,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12185642123222351,
      "kl": 0.002667451975867152,
      "learning_rate": 9.872440944881888e-07,
      "loss": -0.0369,
      "num_tokens": 37956753.0,
      "reward": 0.1875,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.1875,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 1378,
      "step_time": 20.53817980736494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 161.625,
      "completions/mean_terminated_length": 161.625,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.22990813478827477,
      "epoch": 0.06387216303844373,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08274000138044357,
      "kl": 0.0011493629135657102,
      "learning_rate": 9.8723483094025e-07,
      "loss": -0.0076,
      "num_tokens": 37977931.0,
      "reward": 0.5090391635894775,
      "reward_std": 0.05525410920381546,
      "rewards/reward_func/mean": 0.5090391635894775,
      "rewards/reward_func/std": 0.05525410547852516,
      "step": 1379,
      "step_time": 17.247316155582666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 187.25,
      "completions/mean_terminated_length": 187.25,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.4396393895149231,
      "epoch": 0.06391848077813803,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10089975595474243,
      "kl": 0.0018845154263544828,
      "learning_rate": 9.872255673923113e-07,
      "loss": 0.0387,
      "num_tokens": 38000111.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 1380,
      "step_time": 21.832327533513308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 175.0625,
      "completions/mean_terminated_length": 175.0625,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.3795551508665085,
      "epoch": 0.06396479851783234,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013261609710752964,
      "kl": 0.0017147823236882687,
      "learning_rate": 9.872163038443724e-07,
      "loss": 0.0001,
      "num_tokens": 38021600.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1381,
      "step_time": 18.155334655195475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 200.125,
      "completions/mean_terminated_length": 200.125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.4547497630119324,
      "epoch": 0.06401111625752663,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017529228935018182,
      "kl": 0.0019403231563046575,
      "learning_rate": 9.872070402964335e-07,
      "loss": 0.0001,
      "num_tokens": 38046546.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1382,
      "step_time": 24.914843030273914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 161.125,
      "completions/mean_terminated_length": 161.125,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3788337707519531,
      "epoch": 0.06405743399722094,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012503358302637935,
      "kl": 0.0014750408881809562,
      "learning_rate": 9.871977767484947e-07,
      "loss": 0.0001,
      "num_tokens": 38067924.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1383,
      "step_time": 18.031631872057915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 155.8125,
      "completions/mean_terminated_length": 155.8125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.1991540491580963,
      "epoch": 0.06410375173691524,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019283011788502336,
      "kl": 0.001558351592393592,
      "learning_rate": 9.871885132005558e-07,
      "loss": 0.0001,
      "num_tokens": 38092593.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 1384,
      "step_time": 18.598839037120342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 158.625,
      "completions/mean_terminated_length": 158.625,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.3112274408340454,
      "epoch": 0.06415006947660955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011574198724702,
      "kl": 0.0010043757501989603,
      "learning_rate": 9.87179249652617e-07,
      "loss": 0.0001,
      "num_tokens": 38115899.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1385,
      "step_time": 18.114937491714954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 199.25,
      "completions/mean_terminated_length": 199.25,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.44041285663843155,
      "epoch": 0.06419638721630384,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001231481204740703,
      "kl": 0.001540001918328926,
      "learning_rate": 9.87169986104678e-07,
      "loss": 0.0001,
      "num_tokens": 38138351.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1386,
      "step_time": 20.419554706662893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 173.5625,
      "completions/mean_terminated_length": 173.5625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.37382249534130096,
      "epoch": 0.06424270495599815,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014059273526072502,
      "kl": 0.0013032660936005414,
      "learning_rate": 9.871607225567392e-07,
      "loss": 0.0001,
      "num_tokens": 38170504.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1387,
      "step_time": 20.110991090536118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 307.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 221.125,
      "completions/mean_terminated_length": 221.125,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.47965505719184875,
      "epoch": 0.06428902269569245,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07660506665706635,
      "kl": 0.0021281543886289,
      "learning_rate": 9.871514590088003e-07,
      "loss": -0.0959,
      "num_tokens": 38201098.0,
      "reward": 0.0021462831646203995,
      "reward_std": 0.008585091680288315,
      "rewards/reward_func/mean": 0.0021462831646203995,
      "rewards/reward_func/std": 0.008585091680288315,
      "step": 1388,
      "step_time": 28.15257778763771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 143.5,
      "completions/mean_terminated_length": 143.5,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.34423044323921204,
      "epoch": 0.06433534043538676,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012541321339085698,
      "kl": 0.0014289073587860912,
      "learning_rate": 9.871421954608614e-07,
      "loss": 0.0001,
      "num_tokens": 38226082.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1389,
      "step_time": 16.990872882306576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 208.375,
      "completions/mean_terminated_length": 208.375,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.37246695905923843,
      "epoch": 0.06438165817508106,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09345704317092896,
      "kl": 0.0019007799564860761,
      "learning_rate": 9.871329319129225e-07,
      "loss": -0.0781,
      "num_tokens": 38250008.0,
      "reward": 0.28770166635513306,
      "reward_std": 0.2688891291618347,
      "rewards/reward_func/mean": 0.28770166635513306,
      "rewards/reward_func/std": 0.2688891291618347,
      "step": 1390,
      "step_time": 21.541621766984463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 135.875,
      "completions/mean_terminated_length": 135.875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.24001146480441093,
      "epoch": 0.06442797591477537,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012315355706959963,
      "kl": 0.0012011609505861998,
      "learning_rate": 9.871236683649837e-07,
      "loss": 0.0001,
      "num_tokens": 38269638.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1391,
      "step_time": 15.285881139338017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 156.375,
      "completions/mean_terminated_length": 156.375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.44844141602516174,
      "epoch": 0.06447429365446966,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013259457191452384,
      "kl": 0.0015823344583623111,
      "learning_rate": 9.871144048170448e-07,
      "loss": 0.0001,
      "num_tokens": 38312812.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1392,
      "step_time": 21.187436882406473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 131.8125,
      "completions/mean_terminated_length": 131.8125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.31581492722034454,
      "epoch": 0.06452061139416397,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014123879373073578,
      "kl": 0.0013365675986278802,
      "learning_rate": 9.871051412691061e-07,
      "loss": 0.0001,
      "num_tokens": 38332729.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1393,
      "step_time": 14.641268495470285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 151.75,
      "completions/mean_terminated_length": 151.75,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.14506584033370018,
      "epoch": 0.06456692913385827,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016818299191072583,
      "kl": 0.0008369435818167403,
      "learning_rate": 9.870958777211673e-07,
      "loss": 0.0,
      "num_tokens": 38369861.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 1394,
      "step_time": 20.446988452225924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 124.5,
      "completions/mean_terminated_length": 124.5,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.3028858155012131,
      "epoch": 0.06461324687355258,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013620060635730624,
      "kl": 0.0012630632263608277,
      "learning_rate": 9.870866141732282e-07,
      "loss": 0.0001,
      "num_tokens": 38391037.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1395,
      "step_time": 15.609694961458445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 173.875,
      "completions/mean_terminated_length": 173.875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.36677612364292145,
      "epoch": 0.06465956461324687,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024252363946288824,
      "kl": 0.0016025640070438385,
      "learning_rate": 9.870773506252895e-07,
      "loss": 0.0001,
      "num_tokens": 38417067.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1396,
      "step_time": 20.601425986737013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 362.0,
      "completions/max_terminated_length": 362.0,
      "completions/mean_length": 220.8125,
      "completions/mean_terminated_length": 220.8125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3483920767903328,
      "epoch": 0.06470588235294118,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08609218895435333,
      "kl": 0.0013980403309687972,
      "learning_rate": 9.870680870773506e-07,
      "loss": -0.0195,
      "num_tokens": 38441176.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 1397,
      "step_time": 29.755803864449263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 189.6875,
      "completions/mean_terminated_length": 189.6875,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.30657874792814255,
      "epoch": 0.06475220009263548,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07561224699020386,
      "kl": 0.0014870265440549701,
      "learning_rate": 9.870588235294118e-07,
      "loss": 0.0254,
      "num_tokens": 38467299.0,
      "reward": 0.41634833812713623,
      "reward_std": 0.33341115713119507,
      "rewards/reward_func/mean": 0.41634833812713623,
      "rewards/reward_func/std": 0.3334111273288727,
      "step": 1398,
      "step_time": 21.73190562427044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 167.25,
      "completions/mean_terminated_length": 167.25,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.21688029915094376,
      "epoch": 0.06479851783232979,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001277267001569271,
      "kl": 0.0009995947912102565,
      "learning_rate": 9.870495599814729e-07,
      "loss": 0.0,
      "num_tokens": 38490231.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 1399,
      "step_time": 17.318347416818142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 146.5,
      "completions/mean_terminated_length": 146.5,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.2146575041115284,
      "epoch": 0.06484483557202408,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00092996348394081,
      "kl": 0.0008030234603211284,
      "learning_rate": 9.87040296433534e-07,
      "loss": 0.0,
      "num_tokens": 38513119.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 1400,
      "step_time": 16.49129395186901
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 190.6875,
      "completions/mean_terminated_length": 190.6875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.2414935976266861,
      "epoch": 0.0648911533117184,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10723493993282318,
      "kl": 0.0012726300628855824,
      "learning_rate": 9.870310328855951e-07,
      "loss": 0.0472,
      "num_tokens": 38537002.0,
      "reward": 0.5636385679244995,
      "reward_std": 0.15132883191108704,
      "rewards/reward_func/mean": 0.5636385679244995,
      "rewards/reward_func/std": 0.15132883191108704,
      "step": 1401,
      "step_time": 21.77669233083725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 125.375,
      "completions/mean_terminated_length": 125.375,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2189713902771473,
      "epoch": 0.06493747105141269,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011065290309488773,
      "kl": 0.0009664138196967542,
      "learning_rate": 9.870217693376563e-07,
      "loss": 0.0,
      "num_tokens": 38556944.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1402,
      "step_time": 13.41659290716052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 186.1875,
      "completions/mean_terminated_length": 186.1875,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.15766077861189842,
      "epoch": 0.064983788791107,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006077347788959742,
      "kl": 0.0014418928913073614,
      "learning_rate": 9.870125057897174e-07,
      "loss": 0.0001,
      "num_tokens": 38582371.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1403,
      "step_time": 19.582397300750017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 172.1875,
      "completions/mean_terminated_length": 172.1875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.2059054709970951,
      "epoch": 0.0650301065308013,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002235325751826167,
      "kl": 0.0010782177560031414,
      "learning_rate": 9.870032422417785e-07,
      "loss": 0.0001,
      "num_tokens": 38605942.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1404,
      "step_time": 18.263175208121538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 156.9375,
      "completions/mean_terminated_length": 156.9375,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.1598551720380783,
      "epoch": 0.0650764242704956,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006986630032770336,
      "kl": 0.0007289715140359476,
      "learning_rate": 9.869939786938396e-07,
      "loss": 0.0,
      "num_tokens": 38636293.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 1405,
      "step_time": 18.041317779570818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 162.3125,
      "completions/mean_terminated_length": 162.3125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.2856503203511238,
      "epoch": 0.0651227420101899,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15712042152881622,
      "kl": 0.0012672470038523898,
      "learning_rate": 9.86984715145901e-07,
      "loss": 0.0381,
      "num_tokens": 38657098.0,
      "reward": 0.862541675567627,
      "reward_std": 0.23001109063625336,
      "rewards/reward_func/mean": 0.862541675567627,
      "rewards/reward_func/std": 0.23001112043857574,
      "step": 1406,
      "step_time": 18.65338582545519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 158.375,
      "completions/mean_terminated_length": 158.375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.18908322602510452,
      "epoch": 0.06516905974988421,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09071298688650131,
      "kl": 0.0018847110186470672,
      "learning_rate": 9.86975451597962e-07,
      "loss": -0.0321,
      "num_tokens": 38693984.0,
      "reward": 0.9001584053039551,
      "reward_std": 0.05953400582075119,
      "rewards/reward_func/mean": 0.9001584053039551,
      "rewards/reward_func/std": 0.05953400954604149,
      "step": 1407,
      "step_time": 20.77293001487851
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 122.75,
      "completions/mean_terminated_length": 122.75,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.3009890466928482,
      "epoch": 0.06521537748957851,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008008633740246296,
      "kl": 0.0011896536743734032,
      "learning_rate": 9.86966188050023e-07,
      "loss": 0.0001,
      "num_tokens": 38714764.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1408,
      "step_time": 15.979082588106394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 157.0625,
      "completions/mean_terminated_length": 157.0625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.1599237322807312,
      "epoch": 0.06526169522927282,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008259209571406245,
      "kl": 0.0008421517413808033,
      "learning_rate": 9.869569245020841e-07,
      "loss": 0.0,
      "num_tokens": 38737021.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 1409,
      "step_time": 17.57414334639907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 148.6875,
      "completions/mean_terminated_length": 148.6875,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.32970255613327026,
      "epoch": 0.06530801296896711,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008611916564404964,
      "kl": 0.0013319231511559337,
      "learning_rate": 9.869476609541455e-07,
      "loss": 0.0001,
      "num_tokens": 38768536.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1410,
      "step_time": 18.89270857349038
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 198.5,
      "completions/mean_terminated_length": 198.5,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.30774350464344025,
      "epoch": 0.06535433070866142,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006944688502699137,
      "kl": 0.0010836839501280338,
      "learning_rate": 9.869383974062066e-07,
      "loss": 0.0001,
      "num_tokens": 38793552.0,
      "reward": 0.38889557123184204,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.38889557123184204,
      "rewards/reward_func/std": 0.0,
      "step": 1411,
      "step_time": 21.76561936363578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 139.8125,
      "completions/mean_terminated_length": 139.8125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.28513088822364807,
      "epoch": 0.06540064844835572,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019855641294270754,
      "kl": 0.0013956335897091776,
      "learning_rate": 9.869291338582677e-07,
      "loss": 0.0001,
      "num_tokens": 38813325.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1412,
      "step_time": 15.08096693456173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 127.6875,
      "completions/mean_terminated_length": 127.6875,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.35748113691806793,
      "epoch": 0.06544696618805003,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001560273696668446,
      "kl": 0.001687679992755875,
      "learning_rate": 9.869198703103288e-07,
      "loss": 0.0001,
      "num_tokens": 38840872.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1413,
      "step_time": 16.32881325483322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 190.625,
      "completions/mean_terminated_length": 190.625,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.12357214279472828,
      "epoch": 0.06549328392774433,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06670991331338882,
      "kl": 0.0006051260425010696,
      "learning_rate": 9.8691060676239e-07,
      "loss": -0.004,
      "num_tokens": 38873682.0,
      "reward": 0.9907451272010803,
      "reward_std": 0.02528911456465721,
      "rewards/reward_func/mean": 0.9907451272010803,
      "rewards/reward_func/std": 0.025289107114076614,
      "step": 1414,
      "step_time": 21.369696903973818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 151.3125,
      "completions/mean_terminated_length": 151.3125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.35167011618614197,
      "epoch": 0.06553960166743864,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015633010771125555,
      "kl": 0.0016057019238360226,
      "learning_rate": 9.86901343214451e-07,
      "loss": 0.0001,
      "num_tokens": 38895799.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1415,
      "step_time": 15.864126328378916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 167.75,
      "completions/mean_terminated_length": 167.75,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.22239339724183083,
      "epoch": 0.06558591940713293,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09842777997255325,
      "kl": 0.0010389309318270534,
      "learning_rate": 9.868920796665122e-07,
      "loss": 0.0034,
      "num_tokens": 38916995.0,
      "reward": 0.39312544465065,
      "reward_std": 0.007969260215759277,
      "rewards/reward_func/mean": 0.39312544465065,
      "rewards/reward_func/std": 0.007969260215759277,
      "step": 1416,
      "step_time": 16.630940418690443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 159.25,
      "completions/mean_terminated_length": 159.25,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3057669699192047,
      "epoch": 0.06563223714682724,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002081182785332203,
      "kl": 0.001386734249535948,
      "learning_rate": 9.868828161185733e-07,
      "loss": 0.0001,
      "num_tokens": 38939975.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1417,
      "step_time": 16.80906194075942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 151.3125,
      "completions/mean_terminated_length": 151.3125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.33575063198804855,
      "epoch": 0.06567855488652154,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010036277817562222,
      "kl": 0.0013450011028908193,
      "learning_rate": 9.868735525706345e-07,
      "loss": 0.0001,
      "num_tokens": 38966508.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1418,
      "step_time": 16.723738331347704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 130.25,
      "completions/mean_terminated_length": 130.25,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.33988042175769806,
      "epoch": 0.06572487262621585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010299875866621733,
      "kl": 0.0013958606286905706,
      "learning_rate": 9.868642890226956e-07,
      "loss": 0.0001,
      "num_tokens": 38999232.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1419,
      "step_time": 17.246197946369648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 141.4375,
      "completions/mean_terminated_length": 141.4375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.3560640439391136,
      "epoch": 0.06577119036591014,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010152118047699332,
      "kl": 0.001361678441753611,
      "learning_rate": 9.868550254747567e-07,
      "loss": 0.0001,
      "num_tokens": 39021911.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1420,
      "step_time": 15.976904805749655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 171.75,
      "completions/mean_terminated_length": 171.75,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.4093324691057205,
      "epoch": 0.06581750810560445,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015361782861873507,
      "kl": 0.0014861428062431514,
      "learning_rate": 9.868457619268178e-07,
      "loss": 0.0001,
      "num_tokens": 39049523.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1421,
      "step_time": 19.95411391928792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 196.8125,
      "completions/mean_terminated_length": 196.8125,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.193643506616354,
      "epoch": 0.06586382584529875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08457839488983154,
      "kl": 0.0010921327047981322,
      "learning_rate": 9.86836498378879e-07,
      "loss": -0.0378,
      "num_tokens": 39087440.0,
      "reward": 0.9686717987060547,
      "reward_std": 0.12531274557113647,
      "rewards/reward_func/mean": 0.9686717987060547,
      "rewards/reward_func/std": 0.12531273066997528,
      "step": 1422,
      "step_time": 22.384692903608084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 124.875,
      "completions/mean_terminated_length": 124.875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.25259891524910927,
      "epoch": 0.06591014358499306,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011868654983118176,
      "kl": 0.000967387793934904,
      "learning_rate": 9.868272348309403e-07,
      "loss": 0.0,
      "num_tokens": 39110910.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1423,
      "step_time": 14.067098706960678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 127.0,
      "completions/max_terminated_length": 127.0,
      "completions/mean_length": 106.5,
      "completions/mean_terminated_length": 106.5,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.3011389747262001,
      "epoch": 0.06595646132468735,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006358596961945295,
      "kl": 0.0024511981464456767,
      "learning_rate": 9.868179712830014e-07,
      "loss": 0.0001,
      "num_tokens": 39132022.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1424,
      "step_time": 12.666402902454138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 162.0,
      "completions/mean_terminated_length": 162.0,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3955685421824455,
      "epoch": 0.06600277906438166,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008592951926402748,
      "kl": 0.0011703874333761632,
      "learning_rate": 9.868087077350626e-07,
      "loss": 0.0001,
      "num_tokens": 39156838.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1425,
      "step_time": 18.08092623576522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 155.0625,
      "completions/mean_terminated_length": 155.0625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.3590179830789566,
      "epoch": 0.06604909680407596,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002121493685990572,
      "kl": 0.0016639675304759294,
      "learning_rate": 9.867994441871237e-07,
      "loss": 0.0001,
      "num_tokens": 39193223.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1426,
      "step_time": 19.95289271697402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 275.0,
      "completions/max_terminated_length": 275.0,
      "completions/mean_length": 197.9375,
      "completions/mean_terminated_length": 197.9375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.31179778277873993,
      "epoch": 0.06609541454377027,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1187836229801178,
      "kl": 0.002213226573076099,
      "learning_rate": 9.867901806391848e-07,
      "loss": 0.0492,
      "num_tokens": 39224806.0,
      "reward": 0.7406715154647827,
      "reward_std": 0.28912854194641113,
      "rewards/reward_func/mean": 0.7406715154647827,
      "rewards/reward_func/std": 0.2891285717487335,
      "step": 1427,
      "step_time": 25.411048222333193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 194.5625,
      "completions/mean_terminated_length": 194.5625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.2525024935603142,
      "epoch": 0.06614173228346457,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09274663776159286,
      "kl": 0.0013812848483212292,
      "learning_rate": 9.86780917091246e-07,
      "loss": -0.001,
      "num_tokens": 39249119.0,
      "reward": 0.6005731225013733,
      "reward_std": 0.16380806267261505,
      "rewards/reward_func/mean": 0.6005731225013733,
      "rewards/reward_func/std": 0.16380807757377625,
      "step": 1428,
      "step_time": 20.594968132674694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 189.3125,
      "completions/mean_terminated_length": 189.3125,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.23059514537453651,
      "epoch": 0.06618805002315888,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.062491029500961304,
      "kl": 0.0010999125079251826,
      "learning_rate": 9.86771653543307e-07,
      "loss": -0.0285,
      "num_tokens": 39272916.0,
      "reward": 0.9853121042251587,
      "reward_std": 0.04013495892286301,
      "rewards/reward_func/mean": 0.9853121042251587,
      "rewards/reward_func/std": 0.04013495147228241,
      "step": 1429,
      "step_time": 19.905599888414145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 184.5625,
      "completions/mean_terminated_length": 184.5625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.20655705034732819,
      "epoch": 0.06623436776285317,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14017370343208313,
      "kl": 0.0018786978616844863,
      "learning_rate": 9.867623899953682e-07,
      "loss": -0.0422,
      "num_tokens": 39297725.0,
      "reward": 0.6085429787635803,
      "reward_std": 0.35655471682548523,
      "rewards/reward_func/mean": 0.6085429787635803,
      "rewards/reward_func/std": 0.3565547466278076,
      "step": 1430,
      "step_time": 19.59533415362239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 128.0,
      "completions/max_terminated_length": 128.0,
      "completions/mean_length": 112.375,
      "completions/mean_terminated_length": 112.375,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.30003518611192703,
      "epoch": 0.06628068550254748,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011676916619762778,
      "kl": 0.0016723115113563836,
      "learning_rate": 9.867531264474293e-07,
      "loss": 0.0001,
      "num_tokens": 39318339.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1431,
      "step_time": 12.637915696948767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 183.1875,
      "completions/mean_terminated_length": 183.1875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.1929231435060501,
      "epoch": 0.06632700324224178,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07338245958089828,
      "kl": 0.0008385853288928047,
      "learning_rate": 9.867438628994904e-07,
      "loss": -0.012,
      "num_tokens": 39348022.0,
      "reward": 0.6517519950866699,
      "reward_std": 0.08575544506311417,
      "rewards/reward_func/mean": 0.6517519950866699,
      "rewards/reward_func/std": 0.08575543761253357,
      "step": 1432,
      "step_time": 19.820004228502512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 255.0,
      "completions/max_terminated_length": 255.0,
      "completions/mean_length": 198.9375,
      "completions/mean_terminated_length": 198.9375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.35739368200302124,
      "epoch": 0.06637332098193609,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09327326714992523,
      "kl": 0.0018914308166131377,
      "learning_rate": 9.867345993515516e-07,
      "loss": 0.1309,
      "num_tokens": 39370981.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 1433,
      "step_time": 22.179261937737465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 204.0,
      "completions/mean_terminated_length": 204.0,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.3161913976073265,
      "epoch": 0.06641963872163038,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08861406147480011,
      "kl": 0.0019559416105039418,
      "learning_rate": 9.867253358036127e-07,
      "loss": -0.0142,
      "num_tokens": 39408389.0,
      "reward": 0.9501120448112488,
      "reward_std": 0.014566393569111824,
      "rewards/reward_func/mean": 0.9501120448112488,
      "rewards/reward_func/std": 0.014566399157047272,
      "step": 1434,
      "step_time": 25.339416343718767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 334.0,
      "completions/max_terminated_length": 334.0,
      "completions/mean_length": 202.9375,
      "completions/mean_terminated_length": 202.9375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.36778371781110764,
      "epoch": 0.0664659564613247,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0415370799601078,
      "kl": 0.0016239224059972912,
      "learning_rate": 9.867160722556738e-07,
      "loss": -0.0364,
      "num_tokens": 39431428.0,
      "reward": 5.785605026176199e-05,
      "reward_std": 0.0001243867736775428,
      "rewards/reward_func/mean": 5.785605026176199e-05,
      "rewards/reward_func/std": 0.0001243867736775428,
      "step": 1435,
      "step_time": 28.81412662193179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 176.6875,
      "completions/mean_terminated_length": 176.6875,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.36970124393701553,
      "epoch": 0.06651227420101899,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00309493625536561,
      "kl": 0.00200174271594733,
      "learning_rate": 9.867068087077351e-07,
      "loss": 0.0001,
      "num_tokens": 39464127.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1436,
      "step_time": 20.601984571665525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 132.875,
      "completions/mean_terminated_length": 132.875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3056706562638283,
      "epoch": 0.0665585919407133,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001805571955628693,
      "kl": 0.0016618611116427928,
      "learning_rate": 9.866975451597963e-07,
      "loss": 0.0001,
      "num_tokens": 39484637.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1437,
      "step_time": 14.69915597513318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 181.8125,
      "completions/mean_terminated_length": 181.8125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.32961641252040863,
      "epoch": 0.0666049096804076,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13558849692344666,
      "kl": 0.0017127757600974292,
      "learning_rate": 9.866882816118572e-07,
      "loss": 0.0372,
      "num_tokens": 39506714.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 1438,
      "step_time": 21.642262279987335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 172.625,
      "completions/mean_terminated_length": 172.625,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.4146069064736366,
      "epoch": 0.0666512274201019,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012880455469712615,
      "kl": 0.0014203835744410753,
      "learning_rate": 9.866790180639183e-07,
      "loss": 0.0001,
      "num_tokens": 39528004.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1439,
      "step_time": 19.393419571220875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 128.5625,
      "completions/mean_terminated_length": 128.5625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2672436833381653,
      "epoch": 0.0666975451597962,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001372100436128676,
      "kl": 0.0011037438380299136,
      "learning_rate": 9.866697545159796e-07,
      "loss": 0.0001,
      "num_tokens": 39551101.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1440,
      "step_time": 14.55172961205244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 183.6875,
      "completions/mean_terminated_length": 183.6875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.2265946827828884,
      "epoch": 0.06674386289949051,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018938696011900902,
      "kl": 0.0014089698670431972,
      "learning_rate": 9.866604909680408e-07,
      "loss": 0.0001,
      "num_tokens": 39573240.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1441,
      "step_time": 19.77555612847209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 404.0,
      "completions/max_terminated_length": 404.0,
      "completions/mean_length": 262.8125,
      "completions/mean_terminated_length": 262.8125,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.42162948846817017,
      "epoch": 0.0667901806391848,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07937724143266678,
      "kl": 0.0015220747154671699,
      "learning_rate": 9.866512274201019e-07,
      "loss": -0.2059,
      "num_tokens": 39602725.0,
      "reward": 0.17238262295722961,
      "reward_std": 0.36804524064064026,
      "rewards/reward_func/mean": 0.17238262295722961,
      "rewards/reward_func/std": 0.36804524064064026,
      "step": 1442,
      "step_time": 33.67408147081733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 130.625,
      "completions/mean_terminated_length": 130.625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3209586590528488,
      "epoch": 0.06683649837887912,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019692759960889816,
      "kl": 0.0015766523429192603,
      "learning_rate": 9.86641963872163e-07,
      "loss": 0.0001,
      "num_tokens": 39638527.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1443,
      "step_time": 17.754509408026934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 118.375,
      "completions/mean_terminated_length": 118.375,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.288492776453495,
      "epoch": 0.06688281611857341,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015787968877702951,
      "kl": 0.001139527521445416,
      "learning_rate": 9.866327003242241e-07,
      "loss": 0.0001,
      "num_tokens": 39659685.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1444,
      "step_time": 14.985423538833857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 180.6875,
      "completions/mean_terminated_length": 180.6875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.38609933108091354,
      "epoch": 0.06692913385826772,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004827553406357765,
      "kl": 0.0024070857325568795,
      "learning_rate": 9.866234367762853e-07,
      "loss": 0.0001,
      "num_tokens": 39694064.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1445,
      "step_time": 21.36505849659443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 153.5,
      "completions/mean_terminated_length": 153.5,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.4466777443885803,
      "epoch": 0.06697545159796202,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007941923686303198,
      "kl": 0.0013845481153111905,
      "learning_rate": 9.866141732283464e-07,
      "loss": 0.0001,
      "num_tokens": 39732296.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1446,
      "step_time": 20.569972027093172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 160.8125,
      "completions/mean_terminated_length": 160.8125,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.4187733381986618,
      "epoch": 0.06702176933765633,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004035596270114183,
      "kl": 0.0014605073956772685,
      "learning_rate": 9.866049096804075e-07,
      "loss": 0.0001,
      "num_tokens": 39760149.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1447,
      "step_time": 18.4068068228662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 155.75,
      "completions/mean_terminated_length": 155.75,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.21700581908226013,
      "epoch": 0.06706808707735062,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1402587741613388,
      "kl": 0.0012336500367382541,
      "learning_rate": 9.865956461324686e-07,
      "loss": 0.1356,
      "num_tokens": 39786785.0,
      "reward": 0.42196887731552124,
      "reward_std": 0.11329527944326401,
      "rewards/reward_func/mean": 0.42196887731552124,
      "rewards/reward_func/std": 0.1132952868938446,
      "step": 1448,
      "step_time": 21.55864104256034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 169.5625,
      "completions/mean_terminated_length": 169.5625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3580230697989464,
      "epoch": 0.06711440481704493,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010121813975274563,
      "kl": 0.001544493658002466,
      "learning_rate": 9.865863825845298e-07,
      "loss": 0.0001,
      "num_tokens": 39839178.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1449,
      "step_time": 24.623450193554163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 162.875,
      "completions/mean_terminated_length": 162.875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.1634041853249073,
      "epoch": 0.06716072255673923,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000649867404717952,
      "kl": 0.0007056358299450949,
      "learning_rate": 9.86577119036591e-07,
      "loss": 0.0,
      "num_tokens": 39875624.0,
      "reward": 0.8668779134750366,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8668779134750366,
      "rewards/reward_func/std": 0.0,
      "step": 1450,
      "step_time": 20.678642854094505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 163.8125,
      "completions/mean_terminated_length": 163.8125,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.15259314700961113,
      "epoch": 0.06720704029643354,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006768021266907454,
      "kl": 0.0007882158679421991,
      "learning_rate": 9.86567855488652e-07,
      "loss": 0.0,
      "num_tokens": 39902101.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 1451,
      "step_time": 17.91073151677847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 133.6875,
      "completions/mean_terminated_length": 133.6875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2859661877155304,
      "epoch": 0.06725335803612784,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023830877617001534,
      "kl": 0.0018887875776272267,
      "learning_rate": 9.865585919407131e-07,
      "loss": 0.0001,
      "num_tokens": 39922688.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1452,
      "step_time": 15.119699243456125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 221.75,
      "completions/mean_terminated_length": 221.75,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.29905904084444046,
      "epoch": 0.06729967577582215,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06141403689980507,
      "kl": 0.0010330545774195343,
      "learning_rate": 9.865493283927745e-07,
      "loss": 0.0127,
      "num_tokens": 39961964.0,
      "reward": 0.8056573867797852,
      "reward_std": 0.10677226632833481,
      "rewards/reward_func/mean": 0.8056573867797852,
      "rewards/reward_func/std": 0.10677226632833481,
      "step": 1453,
      "step_time": 27.148686934262514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 262.0,
      "completions/max_terminated_length": 262.0,
      "completions/mean_length": 231.9375,
      "completions/mean_terminated_length": 231.9375,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "entropy": 0.2375781461596489,
      "epoch": 0.06734599351551644,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09551922231912613,
      "kl": 0.001087108044885099,
      "learning_rate": 9.865400648448356e-07,
      "loss": 0.011,
      "num_tokens": 39992699.0,
      "reward": 0.9854298830032349,
      "reward_std": 0.019426822662353516,
      "rewards/reward_func/mean": 0.9854298830032349,
      "rewards/reward_func/std": 0.019426824524998665,
      "step": 1454,
      "step_time": 23.67501976713538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 146.4375,
      "completions/mean_terminated_length": 146.4375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3509632274508476,
      "epoch": 0.06739231125521075,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005953608546406031,
      "kl": 0.0023052508768159896,
      "learning_rate": 9.865308012968967e-07,
      "loss": 0.0001,
      "num_tokens": 40013906.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1455,
      "step_time": 16.13047195971012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 165.1875,
      "completions/mean_terminated_length": 165.1875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.2485116682946682,
      "epoch": 0.06743862899490505,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007991045131348073,
      "kl": 0.0010293241939507425,
      "learning_rate": 9.865215377489578e-07,
      "loss": 0.0001,
      "num_tokens": 40034709.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 1456,
      "step_time": 17.48965096846223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 180.0,
      "completions/mean_terminated_length": 180.0,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.4434218630194664,
      "epoch": 0.06748494673459936,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033606907818466425,
      "kl": 0.0022161082015372813,
      "learning_rate": 9.86512274201019e-07,
      "loss": 0.0001,
      "num_tokens": 40058789.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1457,
      "step_time": 18.66258154064417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 163.875,
      "completions/mean_terminated_length": 163.875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.42968491464853287,
      "epoch": 0.06753126447429365,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005685332231223583,
      "kl": 0.004056319885421544,
      "learning_rate": 9.8650301065308e-07,
      "loss": 0.0002,
      "num_tokens": 40092819.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1458,
      "step_time": 19.91689220443368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 160.5,
      "completions/mean_terminated_length": 160.5,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.15658705309033394,
      "epoch": 0.06757758221398796,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006694571929983795,
      "kl": 0.000618590900558047,
      "learning_rate": 9.864937471051412e-07,
      "loss": 0.0,
      "num_tokens": 40116347.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 1459,
      "step_time": 17.509197983890772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 121.0,
      "completions/mean_terminated_length": 121.0,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2313786968588829,
      "epoch": 0.06762389995368226,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009354195208288729,
      "kl": 0.000870649964781478,
      "learning_rate": 9.864844835572023e-07,
      "loss": 0.0,
      "num_tokens": 40136315.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1460,
      "step_time": 12.83052709326148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 161.4375,
      "completions/mean_terminated_length": 161.4375,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.24410752579569817,
      "epoch": 0.06767021769337657,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011613599490374327,
      "kl": 0.0012364076392259449,
      "learning_rate": 9.864752200092635e-07,
      "loss": 0.0001,
      "num_tokens": 40160050.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1461,
      "step_time": 17.837164908647537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 112.25,
      "completions/mean_terminated_length": 112.25,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2768171429634094,
      "epoch": 0.06771653543307087,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001716109924018383,
      "kl": 0.0013961562945041806,
      "learning_rate": 9.864659564613246e-07,
      "loss": 0.0001,
      "num_tokens": 40180454.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1462,
      "step_time": 13.131119310855865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 251.1875,
      "completions/mean_terminated_length": 251.1875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.2666243202984333,
      "epoch": 0.06776285317276518,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08627095818519592,
      "kl": 0.0017540419066790491,
      "learning_rate": 9.864566929133857e-07,
      "loss": -0.092,
      "num_tokens": 40207017.0,
      "reward": 0.8939310312271118,
      "reward_std": 0.2388184517621994,
      "rewards/reward_func/mean": 0.8939310312271118,
      "rewards/reward_func/std": 0.2388184517621994,
      "step": 1463,
      "step_time": 27.345908522605896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 162.0,
      "completions/mean_terminated_length": 162.0,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3583075553178787,
      "epoch": 0.06780917091245947,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011339073535054922,
      "kl": 0.0013876005250494927,
      "learning_rate": 9.864474293654468e-07,
      "loss": 0.0001,
      "num_tokens": 40240185.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1464,
      "step_time": 21.53124388307333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 314.0,
      "completions/max_terminated_length": 314.0,
      "completions/mean_length": 219.3125,
      "completions/mean_terminated_length": 219.3125,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.37661032378673553,
      "epoch": 0.06785548865215378,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09709648042917252,
      "kl": 0.001658972178120166,
      "learning_rate": 9.86438165817508e-07,
      "loss": -0.1201,
      "num_tokens": 40278446.0,
      "reward": 0.1875,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.1875,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 1465,
      "step_time": 29.761932767927647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 163.875,
      "completions/mean_terminated_length": 163.875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.17612246423959732,
      "epoch": 0.06790180639184808,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017276330618187785,
      "kl": 0.0008722720958758146,
      "learning_rate": 9.864289022695693e-07,
      "loss": 0.0,
      "num_tokens": 40299484.0,
      "reward": 0.6563555598258972,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6563555598258972,
      "rewards/reward_func/std": 0.0,
      "step": 1466,
      "step_time": 17.53114926069975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 191.25,
      "completions/mean_terminated_length": 191.25,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.4142700731754303,
      "epoch": 0.06794812413154239,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010640016989782453,
      "kl": 0.0016559420037083328,
      "learning_rate": 9.864196387216304e-07,
      "loss": 0.0001,
      "num_tokens": 40329392.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1467,
      "step_time": 21.341175697743893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 156.0,
      "completions/mean_terminated_length": 156.0,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.33521513640880585,
      "epoch": 0.06799444187123668,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001112421858124435,
      "kl": 0.0010444036161061376,
      "learning_rate": 9.864103751736916e-07,
      "loss": 0.0001,
      "num_tokens": 40362160.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1468,
      "step_time": 19.364713236689568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 134.8125,
      "completions/mean_terminated_length": 134.8125,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.3196274861693382,
      "epoch": 0.06804075961093099,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016130013391375542,
      "kl": 0.001321357995038852,
      "learning_rate": 9.864011116257525e-07,
      "loss": 0.0001,
      "num_tokens": 40382189.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1469,
      "step_time": 16.84289249405265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 199.125,
      "completions/mean_terminated_length": 199.125,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.1728663481771946,
      "epoch": 0.06808707735062529,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009273255127482116,
      "kl": 0.0007691927021369338,
      "learning_rate": 9.863918480778138e-07,
      "loss": 0.0,
      "num_tokens": 40407407.0,
      "reward": 0.8464817404747009,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8464817404747009,
      "rewards/reward_func/std": 0.0,
      "step": 1470,
      "step_time": 21.170282408595085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 127.0,
      "completions/max_terminated_length": 127.0,
      "completions/mean_length": 109.5,
      "completions/mean_terminated_length": 109.5,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.2908935993909836,
      "epoch": 0.0681333950903196,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001340805203653872,
      "kl": 0.0012932246027048677,
      "learning_rate": 9.86382584529875e-07,
      "loss": 0.0001,
      "num_tokens": 40426871.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1471,
      "step_time": 12.124918069690466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 168.75,
      "completions/mean_terminated_length": 168.75,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.34815602749586105,
      "epoch": 0.0681797128300139,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004635367076843977,
      "kl": 0.0017149716150015593,
      "learning_rate": 9.86373320981936e-07,
      "loss": 0.0001,
      "num_tokens": 40461171.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1472,
      "step_time": 20.40032485499978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 195.1875,
      "completions/mean_terminated_length": 195.1875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.38137224316596985,
      "epoch": 0.0682260305697082,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007770671509206295,
      "kl": 0.001301302167121321,
      "learning_rate": 9.863640574339972e-07,
      "loss": 0.0001,
      "num_tokens": 40492854.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1473,
      "step_time": 23.277416292577982
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 181.9375,
      "completions/mean_terminated_length": 181.9375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3474675193428993,
      "epoch": 0.0682723483094025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0800715908408165,
      "kl": 0.0016894775035325438,
      "learning_rate": 9.863547938860583e-07,
      "loss": -0.0142,
      "num_tokens": 40520517.0,
      "reward": 0.9474196434020996,
      "reward_std": 0.05430473014712334,
      "rewards/reward_func/mean": 0.9474196434020996,
      "rewards/reward_func/std": 0.05430473014712334,
      "step": 1474,
      "step_time": 20.104807291179895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 189.4375,
      "completions/mean_terminated_length": 189.4375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.20633366331458092,
      "epoch": 0.06831866604909681,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006397018441930413,
      "kl": 0.0008177153358701617,
      "learning_rate": 9.863455303381194e-07,
      "loss": 0.0,
      "num_tokens": 40556188.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1475,
      "step_time": 22.91086822375655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 212.8125,
      "completions/mean_terminated_length": 212.8125,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.40713974833488464,
      "epoch": 0.0683649837887911,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08309012651443481,
      "kl": 0.001738260907586664,
      "learning_rate": 9.863362667901806e-07,
      "loss": -0.0043,
      "num_tokens": 40583769.0,
      "reward": 0.5439493060112,
      "reward_std": 0.43928495049476624,
      "rewards/reward_func/mean": 0.5439493060112,
      "rewards/reward_func/std": 0.43928495049476624,
      "step": 1476,
      "step_time": 24.10078265890479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 170.5625,
      "completions/mean_terminated_length": 170.5625,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.23392557352781296,
      "epoch": 0.06841130152848542,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.179819718003273,
      "kl": 0.0026494267222005874,
      "learning_rate": 9.863270032422417e-07,
      "loss": 0.0009,
      "num_tokens": 40604786.0,
      "reward": 0.958366334438324,
      "reward_std": 0.04875630885362625,
      "rewards/reward_func/mean": 0.958366334438324,
      "rewards/reward_func/std": 0.04875630885362625,
      "step": 1477,
      "step_time": 18.110772479325533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 124.0,
      "completions/max_terminated_length": 124.0,
      "completions/mean_length": 118.875,
      "completions/mean_terminated_length": 118.875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.23127169162034988,
      "epoch": 0.06845761926817971,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010967552661895752,
      "kl": 0.0012604213261511177,
      "learning_rate": 9.863177396943028e-07,
      "loss": 0.0001,
      "num_tokens": 40624272.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1478,
      "step_time": 12.11015397682786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 523.0,
      "completions/max_terminated_length": 523.0,
      "completions/mean_length": 288.0625,
      "completions/mean_terminated_length": 288.0625,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "entropy": 0.34514622390270233,
      "epoch": 0.06850393700787402,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06308825314044952,
      "kl": 0.0014682095497846603,
      "learning_rate": 9.86308476146364e-07,
      "loss": 0.0653,
      "num_tokens": 40651281.0,
      "reward": 0.1875,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.1875,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 1479,
      "step_time": 40.92439138144255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 138.5625,
      "completions/mean_terminated_length": 138.5625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.28622929006814957,
      "epoch": 0.06855025474756832,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013986499980092049,
      "kl": 0.0013489331759046763,
      "learning_rate": 9.862992125984253e-07,
      "loss": 0.0001,
      "num_tokens": 40677050.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1480,
      "step_time": 15.85581860691309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 183.3125,
      "completions/mean_terminated_length": 183.3125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.2172854319214821,
      "epoch": 0.06859657248726263,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0795762836933136,
      "kl": 0.0007507414120482281,
      "learning_rate": 9.862899490504862e-07,
      "loss": -0.0639,
      "num_tokens": 40701039.0,
      "reward": 0.9469864368438721,
      "reward_std": 0.020694375038146973,
      "rewards/reward_func/mean": 0.9469864368438721,
      "rewards/reward_func/std": 0.020694376900792122,
      "step": 1481,
      "step_time": 19.871653094887733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 193.0625,
      "completions/mean_terminated_length": 193.0625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.20894937589764595,
      "epoch": 0.06864289022695692,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010044968221336603,
      "kl": 0.000901092033018358,
      "learning_rate": 9.862806855025473e-07,
      "loss": 0.0,
      "num_tokens": 40729696.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 1482,
      "step_time": 21.80692085623741
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 182.4375,
      "completions/mean_terminated_length": 182.4375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3963339775800705,
      "epoch": 0.06868920796665123,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014606897020712495,
      "kl": 0.0014750435075256974,
      "learning_rate": 9.862714219546086e-07,
      "loss": 0.0001,
      "num_tokens": 40753495.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1483,
      "step_time": 20.837008390575647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 111.8125,
      "completions/mean_terminated_length": 111.8125,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "entropy": 0.2510620839893818,
      "epoch": 0.06873552570634553,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027503983583301306,
      "kl": 0.0013421807816484943,
      "learning_rate": 9.862621584066698e-07,
      "loss": 0.0001,
      "num_tokens": 40773092.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1484,
      "step_time": 13.886992286890745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 172.1875,
      "completions/mean_terminated_length": 172.1875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3748626112937927,
      "epoch": 0.06878184344603984,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009795506484806538,
      "kl": 0.0013817641884088516,
      "learning_rate": 9.862528948587309e-07,
      "loss": 0.0001,
      "num_tokens": 40808183.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1485,
      "step_time": 21.10917278006673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 196.75,
      "completions/mean_terminated_length": 196.75,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.4569511339068413,
      "epoch": 0.06882816118573414,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003057107562199235,
      "kl": 0.0021614345896523446,
      "learning_rate": 9.86243631310792e-07,
      "loss": 0.0001,
      "num_tokens": 40836915.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1486,
      "step_time": 22.84413205832243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 137.0625,
      "completions/mean_terminated_length": 137.0625,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.33205533027648926,
      "epoch": 0.06887447892542845,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001092462451197207,
      "kl": 0.0013362477766349912,
      "learning_rate": 9.862343677628531e-07,
      "loss": 0.0001,
      "num_tokens": 40872756.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1487,
      "step_time": 19.739625692367554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 122.375,
      "completions/mean_terminated_length": 122.375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.30517494678497314,
      "epoch": 0.06892079666512274,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013137499336153269,
      "kl": 0.0014369545096997172,
      "learning_rate": 9.862251042149143e-07,
      "loss": 0.0001,
      "num_tokens": 40892970.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1488,
      "step_time": 13.604293052107096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 145.0,
      "completions/mean_terminated_length": 145.0,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3870902359485626,
      "epoch": 0.06896711440481705,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012158052995800972,
      "kl": 0.0013868208043277264,
      "learning_rate": 9.862158406669754e-07,
      "loss": 0.0001,
      "num_tokens": 40920314.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1489,
      "step_time": 18.90192151069641
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 192.875,
      "completions/mean_terminated_length": 192.875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3637043759226799,
      "epoch": 0.06901343214451135,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009444152819924057,
      "kl": 0.0014103133289609104,
      "learning_rate": 9.862065771190365e-07,
      "loss": 0.0001,
      "num_tokens": 40954872.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1490,
      "step_time": 25.023595243692398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 159.4375,
      "completions/mean_terminated_length": 159.4375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.40260692685842514,
      "epoch": 0.06905974988420566,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007991600432433188,
      "kl": 0.0014214868424460292,
      "learning_rate": 9.861973135710976e-07,
      "loss": 0.0001,
      "num_tokens": 40989215.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1491,
      "step_time": 21.194200597703457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 120.75,
      "completions/mean_terminated_length": 120.75,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.30024923384189606,
      "epoch": 0.06910606762389995,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011232905089855194,
      "kl": 0.0013113229360897094,
      "learning_rate": 9.861880500231588e-07,
      "loss": 0.0001,
      "num_tokens": 41010619.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1492,
      "step_time": 15.170989360660315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 310.0,
      "completions/max_terminated_length": 310.0,
      "completions/mean_length": 212.75,
      "completions/mean_terminated_length": 212.75,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.35055892914533615,
      "epoch": 0.06915238536359426,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07122951745986938,
      "kl": 0.0015935766277834773,
      "learning_rate": 9.8617878647522e-07,
      "loss": -0.0742,
      "num_tokens": 41035111.0,
      "reward": 0.17740625143051147,
      "reward_std": 0.12721239030361176,
      "rewards/reward_func/mean": 0.17740625143051147,
      "rewards/reward_func/std": 0.12721239030361176,
      "step": 1493,
      "step_time": 25.952363431453705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 144.625,
      "completions/mean_terminated_length": 144.625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3645920604467392,
      "epoch": 0.06919870310328856,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001596534508280456,
      "kl": 0.0016429739189334214,
      "learning_rate": 9.86169522927281e-07,
      "loss": 0.0001,
      "num_tokens": 41088657.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1494,
      "step_time": 23.631266605108976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 189.375,
      "completions/mean_terminated_length": 189.375,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.30949675291776657,
      "epoch": 0.06924502084298287,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08504604548215866,
      "kl": 0.002443965640850365,
      "learning_rate": 9.861602593793421e-07,
      "loss": 0.0474,
      "num_tokens": 41114423.0,
      "reward": 0.611486554145813,
      "reward_std": 0.36575570702552795,
      "rewards/reward_func/mean": 0.611486554145813,
      "rewards/reward_func/std": 0.36575576663017273,
      "step": 1495,
      "step_time": 21.925276305526495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 189.0,
      "completions/mean_terminated_length": 189.0,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.16924650967121124,
      "epoch": 0.06929133858267716,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006864202441647649,
      "kl": 0.0007491152500733733,
      "learning_rate": 9.861509958314035e-07,
      "loss": 0.0,
      "num_tokens": 41168279.0,
      "reward": 0.5623413324356079,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5623413324356079,
      "rewards/reward_func/std": 0.0,
      "step": 1496,
      "step_time": 26.513740804046392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 171.8125,
      "completions/mean_terminated_length": 171.8125,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3513307720422745,
      "epoch": 0.06933765632237147,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002650262787938118,
      "kl": 0.0017704792262520641,
      "learning_rate": 9.861417322834646e-07,
      "loss": 0.0001,
      "num_tokens": 41200180.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1497,
      "step_time": 19.84450488165021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 125.5625,
      "completions/mean_terminated_length": 125.5625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3050597086548805,
      "epoch": 0.06938397406206577,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011662907199934125,
      "kl": 0.0015261732041835785,
      "learning_rate": 9.861324687355257e-07,
      "loss": 0.0001,
      "num_tokens": 41236061.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1498,
      "step_time": 17.509271383285522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 134.8125,
      "completions/mean_terminated_length": 134.8125,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2897963374853134,
      "epoch": 0.06943029180176008,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001060917042195797,
      "kl": 0.0011977645626757294,
      "learning_rate": 9.861232051875869e-07,
      "loss": 0.0001,
      "num_tokens": 41257994.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1499,
      "step_time": 14.279197268188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 185.75,
      "completions/mean_terminated_length": 185.75,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.31334809958934784,
      "epoch": 0.06947660954145438,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09188850969076157,
      "kl": 0.0020196071709506214,
      "learning_rate": 9.86113941639648e-07,
      "loss": -0.0171,
      "num_tokens": 41278694.0,
      "reward": 0.8301382660865784,
      "reward_std": 0.3240528106689453,
      "rewards/reward_func/mean": 0.8301382660865784,
      "rewards/reward_func/std": 0.3240528106689453,
      "step": 1500,
      "step_time": 20.810988426208496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 169.6875,
      "completions/mean_terminated_length": 169.6875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.20143620669841766,
      "epoch": 0.06952292728114869,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00185035087633878,
      "kl": 0.0014070851902943105,
      "learning_rate": 9.86104678091709e-07,
      "loss": 0.0001,
      "num_tokens": 41311921.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 1501,
      "step_time": 21.402443937957287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 150.1875,
      "completions/mean_terminated_length": 150.1875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.34827468544244766,
      "epoch": 0.06956924502084298,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002683652099221945,
      "kl": 0.0019498355686664581,
      "learning_rate": 9.860954145437702e-07,
      "loss": 0.0001,
      "num_tokens": 41334388.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1502,
      "step_time": 16.499029833823442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 182.875,
      "completions/mean_terminated_length": 182.875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.18773585185408592,
      "epoch": 0.06961556276053729,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003130219643935561,
      "kl": 0.001884453697130084,
      "learning_rate": 9.860861509958314e-07,
      "loss": 0.0001,
      "num_tokens": 41387554.0,
      "reward": 0.8890097737312317,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8890097737312317,
      "rewards/reward_func/std": 0.0,
      "step": 1503,
      "step_time": 26.752739500254393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 141.25,
      "completions/mean_terminated_length": 141.25,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.25347447022795677,
      "epoch": 0.06966188050023159,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002005944726988673,
      "kl": 0.0013062612706562504,
      "learning_rate": 9.860768874478925e-07,
      "loss": 0.0001,
      "num_tokens": 41408214.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1504,
      "step_time": 14.816963702440262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 126.1875,
      "completions/mean_terminated_length": 126.1875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2511373423039913,
      "epoch": 0.0697081982399259,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014657479478046298,
      "kl": 0.0011138530389871448,
      "learning_rate": 9.860676238999536e-07,
      "loss": 0.0001,
      "num_tokens": 41428889.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1505,
      "step_time": 13.845789287239313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 255.0,
      "completions/max_terminated_length": 255.0,
      "completions/mean_length": 222.25,
      "completions/mean_terminated_length": 222.25,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.14946654438972473,
      "epoch": 0.0697545159796202,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005345660028979182,
      "kl": 0.0006829862395534292,
      "learning_rate": 9.860583603520147e-07,
      "loss": 0.0,
      "num_tokens": 41456813.0,
      "reward": 0.9636404514312744,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9636404514312744,
      "rewards/reward_func/std": 0.0,
      "step": 1506,
      "step_time": 23.404409043490887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 169.1875,
      "completions/mean_terminated_length": 169.1875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3296942487359047,
      "epoch": 0.0698008337193145,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017873396864160895,
      "kl": 0.0013602831750176847,
      "learning_rate": 9.860490968040759e-07,
      "loss": 0.0001,
      "num_tokens": 41481136.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1507,
      "step_time": 18.71742406859994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 234.6875,
      "completions/mean_terminated_length": 234.6875,
      "completions/min_length": 203.0,
      "completions/min_terminated_length": 203.0,
      "entropy": 0.244209386408329,
      "epoch": 0.0698471514590088,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007023353246040642,
      "kl": 0.0011829751019831747,
      "learning_rate": 9.86039833256137e-07,
      "loss": 0.0001,
      "num_tokens": 41503851.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1508,
      "step_time": 21.888627737760544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 142.8125,
      "completions/mean_terminated_length": 142.8125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.33800843358039856,
      "epoch": 0.06989346919870311,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007219344261102378,
      "kl": 0.001116905506933108,
      "learning_rate": 9.86030569708198e-07,
      "loss": 0.0001,
      "num_tokens": 41529960.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1509,
      "step_time": 16.180447284132242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 128.3125,
      "completions/mean_terminated_length": 128.3125,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2081788331270218,
      "epoch": 0.0699397869383974,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009349219035357237,
      "kl": 0.0007721676956862211,
      "learning_rate": 9.860213061602594e-07,
      "loss": 0.0,
      "num_tokens": 41549389.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1510,
      "step_time": 13.564359836280346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 302.0,
      "completions/max_terminated_length": 302.0,
      "completions/mean_length": 211.3125,
      "completions/mean_terminated_length": 211.3125,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.3014207184314728,
      "epoch": 0.06998610467809172,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0804465040564537,
      "kl": 0.0013809468073304743,
      "learning_rate": 9.860120426123206e-07,
      "loss": -0.0494,
      "num_tokens": 41587394.0,
      "reward": 0.3422159254550934,
      "reward_std": 0.4596257507801056,
      "rewards/reward_func/mean": 0.3422159254550934,
      "rewards/reward_func/std": 0.459625780582428,
      "step": 1511,
      "step_time": 28.52531709894538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 129.125,
      "completions/mean_terminated_length": 129.125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.20699350163340569,
      "epoch": 0.07003242241778601,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12705107033252716,
      "kl": 0.0009369904728373513,
      "learning_rate": 9.860027790643815e-07,
      "loss": 0.0046,
      "num_tokens": 41609716.0,
      "reward": 0.24965901672840118,
      "reward_std": 0.008113396354019642,
      "rewards/reward_func/mean": 0.24965901672840118,
      "rewards/reward_func/std": 0.008113403804600239,
      "step": 1512,
      "step_time": 17.932420033961535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 129.5,
      "completions/mean_terminated_length": 129.5,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.22529035061597824,
      "epoch": 0.07007874015748032,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00819733738899231,
      "kl": 0.0019448845705483109,
      "learning_rate": 9.859935155164428e-07,
      "loss": 0.0001,
      "num_tokens": 41629596.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1513,
      "step_time": 15.249301470816135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 130.75,
      "completions/mean_terminated_length": 130.75,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.27332042157649994,
      "epoch": 0.07012505789717462,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030940512660890818,
      "kl": 0.0012383492721710354,
      "learning_rate": 9.85984251968504e-07,
      "loss": 0.0001,
      "num_tokens": 41660104.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1514,
      "step_time": 17.43046096712351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 150.3125,
      "completions/mean_terminated_length": 150.3125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.4052044004201889,
      "epoch": 0.07017137563686893,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012219988275319338,
      "kl": 0.0013732522202190012,
      "learning_rate": 9.85974988420565e-07,
      "loss": 0.0001,
      "num_tokens": 41697901.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1515,
      "step_time": 21.802428640425205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 174.3125,
      "completions/mean_terminated_length": 174.3125,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.4087982252240181,
      "epoch": 0.07021769337656322,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033047806937247515,
      "kl": 0.0025360305444337428,
      "learning_rate": 9.859657248726262e-07,
      "loss": 0.0001,
      "num_tokens": 41723666.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1516,
      "step_time": 21.03728961199522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 187.8125,
      "completions/mean_terminated_length": 187.8125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.3777107447385788,
      "epoch": 0.07026401111625753,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004337935708463192,
      "kl": 0.0023344085493590683,
      "learning_rate": 9.859564613246873e-07,
      "loss": 0.0001,
      "num_tokens": 41748447.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1517,
      "step_time": 19.76603312790394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 175.0625,
      "completions/mean_terminated_length": 175.0625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.19641326740384102,
      "epoch": 0.07031032885595183,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007934992900118232,
      "kl": 0.00101024258765392,
      "learning_rate": 9.859471977767484e-07,
      "loss": 0.0001,
      "num_tokens": 41775056.0,
      "reward": 0.19390326738357544,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.19390326738357544,
      "rewards/reward_func/std": 0.0,
      "step": 1518,
      "step_time": 18.733803275972605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 161.8125,
      "completions/mean_terminated_length": 161.8125,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.43163342773914337,
      "epoch": 0.07035664659564614,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010543644893914461,
      "kl": 0.0015909954381641,
      "learning_rate": 9.859379342288096e-07,
      "loss": 0.0001,
      "num_tokens": 41819117.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1519,
      "step_time": 23.68850500881672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 230.5,
      "completions/mean_terminated_length": 230.5,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "entropy": 0.27073976024985313,
      "epoch": 0.07040296433534043,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0937320739030838,
      "kl": 0.001118386906455271,
      "learning_rate": 9.859286706808707e-07,
      "loss": -0.0047,
      "num_tokens": 41845861.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 1520,
      "step_time": 23.97225707396865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 147.6875,
      "completions/mean_terminated_length": 147.6875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.351545050740242,
      "epoch": 0.07044928207503474,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018799840472638607,
      "kl": 0.0013992716849315912,
      "learning_rate": 9.859194071329318e-07,
      "loss": 0.0001,
      "num_tokens": 41868080.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1521,
      "step_time": 17.113589253276587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 143.375,
      "completions/mean_terminated_length": 143.375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2863616496324539,
      "epoch": 0.07049559981472904,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005100011359900236,
      "kl": 0.0014330961857922375,
      "learning_rate": 9.85910143584993e-07,
      "loss": 0.0001,
      "num_tokens": 41888838.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1522,
      "step_time": 15.039297252893448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 119.6875,
      "completions/mean_terminated_length": 119.6875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.287519246339798,
      "epoch": 0.07054191755442335,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013172749895602465,
      "kl": 0.001129176904214546,
      "learning_rate": 9.859008800370543e-07,
      "loss": 0.0001,
      "num_tokens": 41911457.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1523,
      "step_time": 13.754213828593493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 230.25,
      "completions/mean_terminated_length": 230.25,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "entropy": 0.22019926458597183,
      "epoch": 0.07058823529411765,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006993401912041008,
      "kl": 0.0008566299948142841,
      "learning_rate": 9.858916164891154e-07,
      "loss": 0.0,
      "num_tokens": 41939349.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1524,
      "step_time": 23.889339812099934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 205.4375,
      "completions/mean_terminated_length": 205.4375,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.3681313022971153,
      "epoch": 0.07063455303381196,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1134500578045845,
      "kl": 0.0015738861984573305,
      "learning_rate": 9.858823529411763e-07,
      "loss": -0.0365,
      "num_tokens": 41977212.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 1525,
      "step_time": 24.386278919875622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 314.0,
      "completions/max_terminated_length": 314.0,
      "completions/mean_length": 269.75,
      "completions/mean_terminated_length": 269.75,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "entropy": 0.26344146579504013,
      "epoch": 0.07068087077350625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06881923973560333,
      "kl": 0.001262011195649393,
      "learning_rate": 9.858730893932376e-07,
      "loss": -0.0533,
      "num_tokens": 42017464.0,
      "reward": 0.6692327260971069,
      "reward_std": 0.3416149616241455,
      "rewards/reward_func/mean": 0.6692327260971069,
      "rewards/reward_func/std": 0.3416149914264679,
      "step": 1526,
      "step_time": 29.74383533746004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 154.125,
      "completions/mean_terminated_length": 154.125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.18717294186353683,
      "epoch": 0.07072718851320056,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10462996363639832,
      "kl": 0.001257291398360394,
      "learning_rate": 9.858638258452988e-07,
      "loss": -0.0005,
      "num_tokens": 42039098.0,
      "reward": 0.9080443978309631,
      "reward_std": 0.03526148200035095,
      "rewards/reward_func/mean": 0.9080443978309631,
      "rewards/reward_func/std": 0.035261478275060654,
      "step": 1527,
      "step_time": 15.544252336025238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 219.6875,
      "completions/mean_terminated_length": 219.6875,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "entropy": 0.2726113051176071,
      "epoch": 0.07077350625289486,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08319716900587082,
      "kl": 0.001257637224625796,
      "learning_rate": 9.8585456229736e-07,
      "loss": -0.0156,
      "num_tokens": 42076981.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 1528,
      "step_time": 24.489597510546446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 154.5,
      "completions/mean_terminated_length": 154.5,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.19127242267131805,
      "epoch": 0.07081982399258917,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09630709886550903,
      "kl": 0.0011548186303116381,
      "learning_rate": 9.85845298749421e-07,
      "loss": -0.051,
      "num_tokens": 42097373.0,
      "reward": 0.9364910125732422,
      "reward_std": 0.18305334448814392,
      "rewards/reward_func/mean": 0.9364910125732422,
      "rewards/reward_func/std": 0.18305335938930511,
      "step": 1529,
      "step_time": 17.424638710916042
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 156.9375,
      "completions/mean_terminated_length": 156.9375,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.17711615934967995,
      "epoch": 0.07086614173228346,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1408398449420929,
      "kl": 0.00129914321587421,
      "learning_rate": 9.858360352014821e-07,
      "loss": 0.0449,
      "num_tokens": 42128620.0,
      "reward": 0.9340888261795044,
      "reward_std": 0.08788163214921951,
      "rewards/reward_func/mean": 0.9340888261795044,
      "rewards/reward_func/std": 0.08788162469863892,
      "step": 1530,
      "step_time": 20.511590659618378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 189.75,
      "completions/mean_terminated_length": 189.75,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.17410175502300262,
      "epoch": 0.07091245947197777,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0045576211996376514,
      "kl": 0.0012940284796059132,
      "learning_rate": 9.858267716535433e-07,
      "loss": 0.0001,
      "num_tokens": 42151368.0,
      "reward": 0.8553453087806702,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8553453087806702,
      "rewards/reward_func/std": 0.0,
      "step": 1531,
      "step_time": 18.73728959262371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 130.0625,
      "completions/mean_terminated_length": 130.0625,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2945939749479294,
      "epoch": 0.07095877721167207,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010982095263898373,
      "kl": 0.0012439826678019017,
      "learning_rate": 9.858175081056044e-07,
      "loss": 0.0001,
      "num_tokens": 42172393.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1532,
      "step_time": 14.610783133655787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 141.5625,
      "completions/mean_terminated_length": 141.5625,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.2522287368774414,
      "epoch": 0.07100509495136638,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022103125229477882,
      "kl": 0.0011451515456428751,
      "learning_rate": 9.858082445576655e-07,
      "loss": 0.0001,
      "num_tokens": 42202178.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1533,
      "step_time": 17.727550856769085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 140.9375,
      "completions/mean_terminated_length": 140.9375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.32373616844415665,
      "epoch": 0.07105141269106068,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015049474313855171,
      "kl": 0.0013415070425253361,
      "learning_rate": 9.857989810097266e-07,
      "loss": 0.0001,
      "num_tokens": 42230017.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1534,
      "step_time": 16.30304079130292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 133.375,
      "completions/mean_terminated_length": 133.375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.3020930588245392,
      "epoch": 0.07109773043075499,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.03848927095532417,
      "kl": 0.001073820938472636,
      "learning_rate": 9.857897174617878e-07,
      "loss": 0.0075,
      "num_tokens": 42251159.0,
      "reward": 0.0004132419126108289,
      "reward_std": 0.00011019785597454756,
      "rewards/reward_func/mean": 0.0004132419126108289,
      "rewards/reward_func/std": 0.00011019784869858995,
      "step": 1535,
      "step_time": 16.230277463793755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 181.9375,
      "completions/mean_terminated_length": 181.9375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.37558628618717194,
      "epoch": 0.07114404817044928,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028143664821982384,
      "kl": 0.0021767235593870282,
      "learning_rate": 9.857804539138491e-07,
      "loss": 0.0001,
      "num_tokens": 42273734.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1536,
      "step_time": 18.9822254255414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 132.0,
      "completions/mean_terminated_length": 132.0,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.3253486528992653,
      "epoch": 0.07119036591014359,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019250869518145919,
      "kl": 0.0018611763371154666,
      "learning_rate": 9.8577119036591e-07,
      "loss": 0.0001,
      "num_tokens": 42295814.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1537,
      "step_time": 14.144557140767574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 234.875,
      "completions/mean_terminated_length": 234.875,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.28953490406274796,
      "epoch": 0.07123668364983789,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05726927891373634,
      "kl": 0.0012834136723540723,
      "learning_rate": 9.857619268179711e-07,
      "loss": 0.019,
      "num_tokens": 42320676.0,
      "reward": 0.005292844492942095,
      "reward_std": 0.0027755454648286104,
      "rewards/reward_func/mean": 0.005292844492942095,
      "rewards/reward_func/std": 0.0027755454648286104,
      "step": 1538,
      "step_time": 22.797854535281658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 142.9375,
      "completions/mean_terminated_length": 142.9375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3037726953625679,
      "epoch": 0.0712830013895322,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012993714772164822,
      "kl": 0.0015164695214480162,
      "learning_rate": 9.857526632700323e-07,
      "loss": 0.0001,
      "num_tokens": 42348339.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1539,
      "step_time": 16.37822227180004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 337.0,
      "completions/max_terminated_length": 337.0,
      "completions/mean_length": 308.25,
      "completions/mean_terminated_length": 308.25,
      "completions/min_length": 283.0,
      "completions/min_terminated_length": 283.0,
      "entropy": 0.257300678640604,
      "epoch": 0.07132931912922649,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0826321393251419,
      "kl": 0.0015540139575023204,
      "learning_rate": 9.857433997220936e-07,
      "loss": 0.0212,
      "num_tokens": 42378407.0,
      "reward": 0.9962121248245239,
      "reward_std": 0.005802525207400322,
      "rewards/reward_func/mean": 0.9962121248245239,
      "rewards/reward_func/std": 0.005802526138722897,
      "step": 1540,
      "step_time": 28.582728251814842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 163.125,
      "completions/mean_terminated_length": 163.125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3340243548154831,
      "epoch": 0.0713756368689208,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017559159314259887,
      "kl": 0.001430229633115232,
      "learning_rate": 9.857341361741547e-07,
      "loss": 0.0001,
      "num_tokens": 42407289.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1541,
      "step_time": 21.22570151463151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 154.875,
      "completions/mean_terminated_length": 154.875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.4333033561706543,
      "epoch": 0.0714219546086151,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011042456608265638,
      "kl": 0.0015356171934399754,
      "learning_rate": 9.857248726262159e-07,
      "loss": 0.0001,
      "num_tokens": 42457591.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1542,
      "step_time": 24.36034982651472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 118.625,
      "completions/mean_terminated_length": 118.625,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.24477159976959229,
      "epoch": 0.07146827234830941,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0065881493501365185,
      "kl": 0.0025198623770847917,
      "learning_rate": 9.85715609078277e-07,
      "loss": 0.0001,
      "num_tokens": 42478753.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1543,
      "step_time": 13.17404094710946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 175.1875,
      "completions/mean_terminated_length": 175.1875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.21906909719109535,
      "epoch": 0.0715145900880037,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0927964523434639,
      "kl": 0.0017490903264842927,
      "learning_rate": 9.857063455303381e-07,
      "loss": -0.0622,
      "num_tokens": 42500164.0,
      "reward": 0.3209686875343323,
      "reward_std": 0.18616418540477753,
      "rewards/reward_func/mean": 0.3209686875343323,
      "rewards/reward_func/std": 0.18616418540477753,
      "step": 1544,
      "step_time": 20.35236304998398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 143.0,
      "completions/mean_terminated_length": 143.0,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.33880386501550674,
      "epoch": 0.07156090782769801,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019858982414007187,
      "kl": 0.0014867189747747034,
      "learning_rate": 9.856970819823992e-07,
      "loss": 0.0001,
      "num_tokens": 42522932.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1545,
      "step_time": 17.753128845244646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 210.4375,
      "completions/mean_terminated_length": 210.4375,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.35662582516670227,
      "epoch": 0.07160722556739231,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09567058086395264,
      "kl": 0.0016411022515967488,
      "learning_rate": 9.856878184344604e-07,
      "loss": -0.0326,
      "num_tokens": 42557371.0,
      "reward": 0.7110782861709595,
      "reward_std": 0.3527936637401581,
      "rewards/reward_func/mean": 0.7110782861709595,
      "rewards/reward_func/std": 0.3527936339378357,
      "step": 1546,
      "step_time": 24.16016223654151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 255.0,
      "completions/max_terminated_length": 255.0,
      "completions/mean_length": 214.875,
      "completions/mean_terminated_length": 214.875,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.3172084540128708,
      "epoch": 0.07165354330708662,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07621504366397858,
      "kl": 0.0023754789726808667,
      "learning_rate": 9.856785548865215e-07,
      "loss": -0.0338,
      "num_tokens": 42579945.0,
      "reward": 0.5271327495574951,
      "reward_std": 0.31990692019462585,
      "rewards/reward_func/mean": 0.5271327495574951,
      "rewards/reward_func/std": 0.31990692019462585,
      "step": 1547,
      "step_time": 21.686159301549196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 129.4375,
      "completions/mean_terminated_length": 129.4375,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2654934898018837,
      "epoch": 0.07169986104678092,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008809834835119545,
      "kl": 0.0012236626353114843,
      "learning_rate": 9.856692913385826e-07,
      "loss": 0.0001,
      "num_tokens": 42602880.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1548,
      "step_time": 15.267893463373184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 436.0,
      "completions/max_terminated_length": 436.0,
      "completions/mean_length": 276.9375,
      "completions/mean_terminated_length": 276.9375,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.48166823387145996,
      "epoch": 0.07174617878647523,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07281829416751862,
      "kl": 0.0012351717159617692,
      "learning_rate": 9.856600277906437e-07,
      "loss": 0.0372,
      "num_tokens": 42635519.0,
      "reward": 0.625,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5,
      "step": 1549,
      "step_time": 36.32584190368652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 142.6875,
      "completions/mean_terminated_length": 142.6875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.2776329219341278,
      "epoch": 0.07179249652616952,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011818609200417995,
      "kl": 0.0012460009602364153,
      "learning_rate": 9.856507642427049e-07,
      "loss": 0.0001,
      "num_tokens": 42660474.0,
      "reward": 5.965462696622126e-05,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 5.965462696622126e-05,
      "rewards/reward_func/std": 0.0,
      "step": 1550,
      "step_time": 17.04089403897524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 119.5625,
      "completions/mean_terminated_length": 119.5625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.23602842167019844,
      "epoch": 0.07183881426586383,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013200322864577174,
      "kl": 0.0011846898560179397,
      "learning_rate": 9.85641500694766e-07,
      "loss": 0.0001,
      "num_tokens": 42680131.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1551,
      "step_time": 13.902039337903261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 117.0,
      "completions/max_terminated_length": 117.0,
      "completions/mean_length": 103.6875,
      "completions/mean_terminated_length": 103.6875,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "entropy": 0.27024680376052856,
      "epoch": 0.07188513200555813,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014684591442346573,
      "kl": 0.001529490080429241,
      "learning_rate": 9.856322371468271e-07,
      "loss": 0.0001,
      "num_tokens": 42699710.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1552,
      "step_time": 12.134970366954803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 174.4375,
      "completions/mean_terminated_length": 174.4375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.2280416190624237,
      "epoch": 0.07193144974525244,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0778835192322731,
      "kl": 0.001227862056111917,
      "learning_rate": 9.856229735988884e-07,
      "loss": -0.0424,
      "num_tokens": 42723541.0,
      "reward": 0.9569141864776611,
      "reward_std": 0.034468624740839005,
      "rewards/reward_func/mean": 0.9569141864776611,
      "rewards/reward_func/std": 0.034468621015548706,
      "step": 1553,
      "step_time": 19.63005105406046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 123.6875,
      "completions/mean_terminated_length": 123.6875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.27046234905719757,
      "epoch": 0.07197776748494673,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001302762539125979,
      "kl": 0.0013518096529878676,
      "learning_rate": 9.856137100509496e-07,
      "loss": 0.0001,
      "num_tokens": 42743248.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1554,
      "step_time": 14.400965578854084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 135.875,
      "completions/mean_terminated_length": 135.875,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.15213468298316002,
      "epoch": 0.07202408522464104,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001393636455759406,
      "kl": 0.0008806453843135387,
      "learning_rate": 9.856044465030105e-07,
      "loss": 0.0,
      "num_tokens": 42763646.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1555,
      "step_time": 15.216861758381128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 163.0,
      "completions/mean_terminated_length": 163.0,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.22033360600471497,
      "epoch": 0.07207040296433534,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1002715602517128,
      "kl": 0.0021226152894087136,
      "learning_rate": 9.855951829550718e-07,
      "loss": -0.0179,
      "num_tokens": 42791758.0,
      "reward": 0.5921446084976196,
      "reward_std": 0.2171451300382614,
      "rewards/reward_func/mean": 0.5921446084976196,
      "rewards/reward_func/std": 0.2171451449394226,
      "step": 1556,
      "step_time": 18.4462536200881
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 187.75,
      "completions/mean_terminated_length": 187.75,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.2702718712389469,
      "epoch": 0.07211672070402965,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011533231008797884,
      "kl": 0.001254095055628568,
      "learning_rate": 9.85585919407133e-07,
      "loss": 0.0001,
      "num_tokens": 42818602.0,
      "reward": 0.694277822971344,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.694277822971344,
      "rewards/reward_func/std": 0.0,
      "step": 1557,
      "step_time": 22.43714876100421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 131.9375,
      "completions/mean_terminated_length": 131.9375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3226363807916641,
      "epoch": 0.07216303844372395,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008729996625334024,
      "kl": 0.0011182344460394233,
      "learning_rate": 9.85576655859194e-07,
      "loss": 0.0001,
      "num_tokens": 42841369.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1558,
      "step_time": 14.938069373369217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 376.0,
      "completions/max_terminated_length": 376.0,
      "completions/mean_length": 249.5625,
      "completions/mean_terminated_length": 249.5625,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.3340029790997505,
      "epoch": 0.07220935618341826,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09114965796470642,
      "kl": 0.00174188960227184,
      "learning_rate": 9.855673923112552e-07,
      "loss": -0.1493,
      "num_tokens": 42880370.0,
      "reward": 0.27733278274536133,
      "reward_std": 0.39612674713134766,
      "rewards/reward_func/mean": 0.27733278274536133,
      "rewards/reward_func/std": 0.39612674713134766,
      "step": 1559,
      "step_time": 33.74260265380144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 198.3125,
      "completions/mean_terminated_length": 198.3125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.24380797892808914,
      "epoch": 0.07225567392311255,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06857086718082428,
      "kl": 0.0008929253672249615,
      "learning_rate": 9.855581287633163e-07,
      "loss": -0.0055,
      "num_tokens": 42916183.0,
      "reward": 0.874759316444397,
      "reward_std": 0.10368803143501282,
      "rewards/reward_func/mean": 0.874759316444397,
      "rewards/reward_func/std": 0.10368802398443222,
      "step": 1560,
      "step_time": 22.03244859352708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 299.0,
      "completions/max_terminated_length": 299.0,
      "completions/mean_length": 226.125,
      "completions/mean_terminated_length": 226.125,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.21797719597816467,
      "epoch": 0.07230199166280686,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08479955047369003,
      "kl": 0.0018371949554421008,
      "learning_rate": 9.855488652153774e-07,
      "loss": -0.0467,
      "num_tokens": 42939257.0,
      "reward": 0.9113311767578125,
      "reward_std": 0.24497197568416595,
      "rewards/reward_func/mean": 0.9113311767578125,
      "rewards/reward_func/std": 0.24497199058532715,
      "step": 1561,
      "step_time": 25.829491283744574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 163.1875,
      "completions/mean_terminated_length": 163.1875,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.26966360956430435,
      "epoch": 0.07234830940250116,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2612365484237671,
      "kl": 0.0017238709842786193,
      "learning_rate": 9.855396016674386e-07,
      "loss": -0.169,
      "num_tokens": 42963420.0,
      "reward": 0.19720938801765442,
      "reward_std": 0.26262229681015015,
      "rewards/reward_func/mean": 0.19720938801765442,
      "rewards/reward_func/std": 0.26262232661247253,
      "step": 1562,
      "step_time": 20.299479123204947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 222.125,
      "completions/mean_terminated_length": 222.125,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.281155988574028,
      "epoch": 0.07239462714219547,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015874463133513927,
      "kl": 0.0014850206207484007,
      "learning_rate": 9.855303381194997e-07,
      "loss": 0.0001,
      "num_tokens": 43000862.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1563,
      "step_time": 24.06028039380908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 206.1875,
      "completions/mean_terminated_length": 206.1875,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.33932363986968994,
      "epoch": 0.07244094488188976,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06308679282665253,
      "kl": 0.0017190151556860656,
      "learning_rate": 9.855210745715608e-07,
      "loss": 0.0288,
      "num_tokens": 43025905.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 1564,
      "step_time": 21.441253323107958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 170.0,
      "completions/mean_terminated_length": 170.0,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.34898870438337326,
      "epoch": 0.07248726262158407,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019805605988949537,
      "kl": 0.0015508705982938409,
      "learning_rate": 9.85511811023622e-07,
      "loss": 0.0001,
      "num_tokens": 43051713.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1565,
      "step_time": 18.230930637568235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 323.0,
      "completions/max_terminated_length": 323.0,
      "completions/mean_length": 264.375,
      "completions/mean_terminated_length": 264.375,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "entropy": 0.33082958310842514,
      "epoch": 0.07253358036127837,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.062147557735443115,
      "kl": 0.001354100095340982,
      "learning_rate": 9.855025474756833e-07,
      "loss": 0.0063,
      "num_tokens": 43084087.0,
      "reward": 0.8738458156585693,
      "reward_std": 0.23302555084228516,
      "rewards/reward_func/mean": 0.8738458156585693,
      "rewards/reward_func/std": 0.23302556574344635,
      "step": 1566,
      "step_time": 28.961606048047543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 195.4375,
      "completions/mean_terminated_length": 195.4375,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.27759377658367157,
      "epoch": 0.07257989810097268,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00172783515881747,
      "kl": 0.0012512272805906832,
      "learning_rate": 9.854932839277444e-07,
      "loss": 0.0001,
      "num_tokens": 43106286.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 1567,
      "step_time": 20.849223010241985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 126.625,
      "completions/mean_terminated_length": 126.625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2594129666686058,
      "epoch": 0.07262621584066697,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001080922782421112,
      "kl": 0.0012128448870498687,
      "learning_rate": 9.854840203798053e-07,
      "loss": 0.0001,
      "num_tokens": 43125944.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1568,
      "step_time": 13.997385706752539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 179.9375,
      "completions/mean_terminated_length": 179.9375,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.284223347902298,
      "epoch": 0.07267253358036128,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001301810727454722,
      "kl": 0.001221503916895017,
      "learning_rate": 9.854747568318664e-07,
      "loss": 0.0001,
      "num_tokens": 43149511.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 1569,
      "step_time": 18.766128912568092
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 209.8125,
      "completions/mean_terminated_length": 209.8125,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "entropy": 0.27987564355134964,
      "epoch": 0.07271885132005558,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011029279557988048,
      "kl": 0.001090632751584053,
      "learning_rate": 9.854654932839278e-07,
      "loss": 0.0001,
      "num_tokens": 43176004.0,
      "reward": 0.4111122786998749,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.4111122786998749,
      "rewards/reward_func/std": 0.0,
      "step": 1570,
      "step_time": 21.385261174291372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 199.25,
      "completions/mean_terminated_length": 199.25,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.276499480009079,
      "epoch": 0.07276516905974989,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014302186900749803,
      "kl": 0.0012794569629477337,
      "learning_rate": 9.85456229735989e-07,
      "loss": 0.0001,
      "num_tokens": 43199784.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1571,
      "step_time": 20.255388107150793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 228.6875,
      "completions/mean_terminated_length": 228.6875,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.223116934299469,
      "epoch": 0.07281148679944419,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000991158070974052,
      "kl": 0.0010674456279957667,
      "learning_rate": 9.8544696618805e-07,
      "loss": 0.0001,
      "num_tokens": 43224595.0,
      "reward": 0.8668779134750366,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8668779134750366,
      "rewards/reward_func/std": 0.0,
      "step": 1572,
      "step_time": 22.18226484954357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 122.0625,
      "completions/mean_terminated_length": 122.0625,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "entropy": 0.2856691852211952,
      "epoch": 0.0728578045391385,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024775387719273567,
      "kl": 0.0017244815244339406,
      "learning_rate": 9.854377026401112e-07,
      "loss": 0.0001,
      "num_tokens": 43245108.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1573,
      "step_time": 13.81055148690939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 137.125,
      "completions/mean_terminated_length": 137.125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.22614125907421112,
      "epoch": 0.07290412227883279,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031758092809468508,
      "kl": 0.0012325557472649962,
      "learning_rate": 9.854284390921723e-07,
      "loss": 0.0001,
      "num_tokens": 43266486.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1574,
      "step_time": 14.72935138642788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 153.0,
      "completions/mean_terminated_length": 153.0,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.18126782774925232,
      "epoch": 0.0729504400185271,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023152644280344248,
      "kl": 0.0010906383686233312,
      "learning_rate": 9.854191755442334e-07,
      "loss": 0.0001,
      "num_tokens": 43294134.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1575,
      "step_time": 17.822266314178705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 133.9375,
      "completions/mean_terminated_length": 133.9375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.31912506371736526,
      "epoch": 0.0729967577582214,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012292397441342473,
      "kl": 0.001754308439558372,
      "learning_rate": 9.854099119962945e-07,
      "loss": 0.0001,
      "num_tokens": 43321141.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1576,
      "step_time": 16.450386211276054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 129.8125,
      "completions/mean_terminated_length": 129.8125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2551668845117092,
      "epoch": 0.07304307549791571,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017605704488232732,
      "kl": 0.0014316203887574375,
      "learning_rate": 9.854006484483557e-07,
      "loss": 0.0001,
      "num_tokens": 43340706.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1577,
      "step_time": 13.634104192256927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 174.0,
      "completions/mean_terminated_length": 174.0,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.37115003913640976,
      "epoch": 0.07308939323761,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004548309370875359,
      "kl": 0.001881248492281884,
      "learning_rate": 9.853913849004168e-07,
      "loss": 0.0001,
      "num_tokens": 43364722.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1578,
      "step_time": 18.42423866316676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 138.6875,
      "completions/mean_terminated_length": 138.6875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.35234903544187546,
      "epoch": 0.07313571097730431,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015647612744942307,
      "kl": 0.0015877830155659467,
      "learning_rate": 9.85382121352478e-07,
      "loss": 0.0001,
      "num_tokens": 43400653.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1579,
      "step_time": 18.515632305294275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 158.8125,
      "completions/mean_terminated_length": 158.8125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.3327949196100235,
      "epoch": 0.07318202871699861,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010463129729032516,
      "kl": 0.0013174892228562385,
      "learning_rate": 9.85372857804539e-07,
      "loss": 0.0001,
      "num_tokens": 43434346.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1580,
      "step_time": 22.90632711723447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 452.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 424.5625,
      "completions/mean_terminated_length": 424.5625,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "entropy": 0.16697721183300018,
      "epoch": 0.07322834645669292,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005113129736855626,
      "kl": 0.0006655202596448362,
      "learning_rate": 9.853635942566002e-07,
      "loss": 0.0,
      "num_tokens": 43472355.0,
      "reward": 0.7753521203994751,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7753521203994751,
      "rewards/reward_func/std": 0.0,
      "step": 1581,
      "step_time": 38.204998683184385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 127.0,
      "completions/mean_terminated_length": 127.0,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.34289418160915375,
      "epoch": 0.07327466419638722,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014903299743309617,
      "kl": 0.001732117059873417,
      "learning_rate": 9.853543307086613e-07,
      "loss": 0.0001,
      "num_tokens": 43495331.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1582,
      "step_time": 14.250513847917318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 179.0,
      "completions/mean_terminated_length": 179.0,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.336424358189106,
      "epoch": 0.07332098193608153,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002108678687363863,
      "kl": 0.0015903627790976316,
      "learning_rate": 9.853450671607226e-07,
      "loss": 0.0001,
      "num_tokens": 43524531.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1583,
      "step_time": 19.401917461305857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 192.875,
      "completions/mean_terminated_length": 192.875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.36616717278957367,
      "epoch": 0.07336729967577582,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08924608677625656,
      "kl": 0.0015478063724003732,
      "learning_rate": 9.853358036127837e-07,
      "loss": -0.0963,
      "num_tokens": 43546737.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 1584,
      "step_time": 22.100545328110456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 164.3125,
      "completions/mean_terminated_length": 164.3125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.34544944763183594,
      "epoch": 0.07341361741547013,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003837409196421504,
      "kl": 0.0018765542190521955,
      "learning_rate": 9.853265400648449e-07,
      "loss": 0.0001,
      "num_tokens": 43575014.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1585,
      "step_time": 18.930589731782675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 131.375,
      "completions/mean_terminated_length": 131.375,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2863118126988411,
      "epoch": 0.07345993515516443,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011196363484486938,
      "kl": 0.0011951862688874826,
      "learning_rate": 9.853172765169058e-07,
      "loss": 0.0001,
      "num_tokens": 43602924.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1586,
      "step_time": 16.04774810373783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 205.25,
      "completions/mean_terminated_length": 205.25,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.2627539038658142,
      "epoch": 0.07350625289485874,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001070684753358364,
      "kl": 0.0009951856918632984,
      "learning_rate": 9.853080129689671e-07,
      "loss": 0.0,
      "num_tokens": 43634160.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1587,
      "step_time": 21.734125968068838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 180.9375,
      "completions/mean_terminated_length": 180.9375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.39849239587783813,
      "epoch": 0.07355257063455303,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00107945513445884,
      "kl": 0.0014529027102980763,
      "learning_rate": 9.852987494210282e-07,
      "loss": 0.0001,
      "num_tokens": 43664735.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1588,
      "step_time": 21.05367875471711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 222.375,
      "completions/mean_terminated_length": 222.375,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.3285161554813385,
      "epoch": 0.07359888837424734,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10005734860897064,
      "kl": 0.0024754175101406872,
      "learning_rate": 9.852894858730894e-07,
      "loss": 0.0074,
      "num_tokens": 43689813.0,
      "reward": 0.7110038995742798,
      "reward_std": 0.4143187403678894,
      "rewards/reward_func/mean": 0.7110038995742798,
      "rewards/reward_func/std": 0.4143187403678894,
      "step": 1589,
      "step_time": 24.597700908780098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 144.9375,
      "completions/mean_terminated_length": 144.9375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3973710164427757,
      "epoch": 0.07364520611394164,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014514324720948935,
      "kl": 0.001288296072743833,
      "learning_rate": 9.852802223251505e-07,
      "loss": 0.0001,
      "num_tokens": 43713028.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1590,
      "step_time": 15.701747439801693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 152.4375,
      "completions/mean_terminated_length": 152.4375,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.16994201391935349,
      "epoch": 0.07369152385363595,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014112734934315085,
      "kl": 0.0008843439863994718,
      "learning_rate": 9.852709587772116e-07,
      "loss": 0.0,
      "num_tokens": 43737995.0,
      "reward": 0.5736212730407715,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5736212730407715,
      "rewards/reward_func/std": 0.0,
      "step": 1591,
      "step_time": 16.465637356042862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 189.25,
      "completions/mean_terminated_length": 189.25,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.4402531236410141,
      "epoch": 0.07373784159333024,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018565183272585273,
      "kl": 0.0017671606037765741,
      "learning_rate": 9.852616952292727e-07,
      "loss": 0.0001,
      "num_tokens": 43759647.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1592,
      "step_time": 18.82415586337447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 130.75,
      "completions/mean_terminated_length": 130.75,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.16491558775305748,
      "epoch": 0.07378415933302455,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004695961717516184,
      "kl": 0.0017564288573339581,
      "learning_rate": 9.852524316813339e-07,
      "loss": 0.0001,
      "num_tokens": 43782939.0,
      "reward": 0.765928328037262,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.765928328037262,
      "rewards/reward_func/std": 0.0,
      "step": 1593,
      "step_time": 15.761767968535423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 263.125,
      "completions/mean_terminated_length": 263.125,
      "completions/min_length": 242.0,
      "completions/min_terminated_length": 242.0,
      "entropy": 0.22686590254306793,
      "epoch": 0.07383047707271885,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08620725572109222,
      "kl": 0.0012763368431478739,
      "learning_rate": 9.85243168133395e-07,
      "loss": 0.0026,
      "num_tokens": 43822381.0,
      "reward": 0.7883949279785156,
      "reward_std": 0.0050514861941337585,
      "rewards/reward_func/mean": 0.7883949279785156,
      "rewards/reward_func/std": 0.005051496438682079,
      "step": 1594,
      "step_time": 28.364187084138393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 145.5,
      "completions/mean_terminated_length": 145.5,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.147944588214159,
      "epoch": 0.07387679481241316,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010562414536252618,
      "kl": 0.0009138431632891297,
      "learning_rate": 9.852339045854561e-07,
      "loss": 0.0,
      "num_tokens": 43843749.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1595,
      "step_time": 15.635121572762728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 147.0625,
      "completions/mean_terminated_length": 147.0625,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.1813713274896145,
      "epoch": 0.07392311255210746,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004045617301017046,
      "kl": 0.0015739940863568336,
      "learning_rate": 9.852246410375174e-07,
      "loss": 0.0001,
      "num_tokens": 43864390.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1596,
      "step_time": 15.721635963767767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 421.0,
      "completions/max_terminated_length": 421.0,
      "completions/mean_length": 345.125,
      "completions/mean_terminated_length": 345.125,
      "completions/min_length": 309.0,
      "completions/min_terminated_length": 309.0,
      "entropy": 0.24516578018665314,
      "epoch": 0.07396943029180177,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07490377128124237,
      "kl": 0.0011413791944505647,
      "learning_rate": 9.852153774895786e-07,
      "loss": -0.0058,
      "num_tokens": 43901544.0,
      "reward": 0.9853454232215881,
      "reward_std": 0.008085109293460846,
      "rewards/reward_func/mean": 0.9853454232215881,
      "rewards/reward_func/std": 0.008085114881396294,
      "step": 1597,
      "step_time": 36.766721062362194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 135.9375,
      "completions/mean_terminated_length": 135.9375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.24951966106891632,
      "epoch": 0.07401574803149606,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004406257998198271,
      "kl": 0.001871462882263586,
      "learning_rate": 9.852061139416395e-07,
      "loss": 0.0001,
      "num_tokens": 43921367.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1598,
      "step_time": 15.098415672779083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 182.375,
      "completions/mean_terminated_length": 182.375,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.3961958587169647,
      "epoch": 0.07406206577119037,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018680575303733349,
      "kl": 0.0016966285184025764,
      "learning_rate": 9.851968503937006e-07,
      "loss": 0.0001,
      "num_tokens": 43943709.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1599,
      "step_time": 20.12541764602065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 172.5,
      "completions/mean_terminated_length": 172.5,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.26840293034911156,
      "epoch": 0.07410838351088467,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14280129969120026,
      "kl": 0.002122916339430958,
      "learning_rate": 9.85187586845762e-07,
      "loss": -0.0056,
      "num_tokens": 43980901.0,
      "reward": 0.5336841344833374,
      "reward_std": 0.451254665851593,
      "rewards/reward_func/mean": 0.5336841344833374,
      "rewards/reward_func/std": 0.4512546956539154,
      "step": 1600,
      "step_time": 22.47697388380766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 200.0,
      "completions/mean_terminated_length": 200.0,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.2988668344914913,
      "epoch": 0.07415470125057898,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10289546847343445,
      "kl": 0.0025755142560228705,
      "learning_rate": 9.85178323297823e-07,
      "loss": 0.0135,
      "num_tokens": 44007541.0,
      "reward": 0.1463262289762497,
      "reward_std": 0.17135973274707794,
      "rewards/reward_func/mean": 0.1463262289762497,
      "rewards/reward_func/std": 0.17135973274707794,
      "step": 1601,
      "step_time": 22.601786609739065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 171.1875,
      "completions/mean_terminated_length": 171.1875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.25361642614006996,
      "epoch": 0.07420101899027327,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08131257444620132,
      "kl": 0.0017854655161499977,
      "learning_rate": 9.851690597498842e-07,
      "loss": 0.0436,
      "num_tokens": 44029720.0,
      "reward": 0.8720642328262329,
      "reward_std": 0.2398042231798172,
      "rewards/reward_func/mean": 0.8720642328262329,
      "rewards/reward_func/std": 0.2398042231798172,
      "step": 1602,
      "step_time": 19.34695564210415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 150.3125,
      "completions/mean_terminated_length": 150.3125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.3029944971203804,
      "epoch": 0.07424733672996758,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002730795880779624,
      "kl": 0.001728356524836272,
      "learning_rate": 9.851597962019453e-07,
      "loss": 0.0001,
      "num_tokens": 44051229.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1603,
      "step_time": 16.14696368575096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 173.5625,
      "completions/mean_terminated_length": 173.5625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.2576616480946541,
      "epoch": 0.07429365446966188,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08358705043792725,
      "kl": 0.0013877996534574777,
      "learning_rate": 9.851505326540064e-07,
      "loss": -0.0062,
      "num_tokens": 44079830.0,
      "reward": 0.9216919541358948,
      "reward_std": 0.037023257464170456,
      "rewards/reward_func/mean": 0.9216919541358948,
      "rewards/reward_func/std": 0.03702325373888016,
      "step": 1604,
      "step_time": 20.236038610339165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 164.375,
      "completions/mean_terminated_length": 164.375,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.15927428007125854,
      "epoch": 0.07433997220935619,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006458433927036822,
      "kl": 0.0007009407709119841,
      "learning_rate": 9.851412691060676e-07,
      "loss": 0.0,
      "num_tokens": 44116348.0,
      "reward": 0.8751733303070068,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8751733303070068,
      "rewards/reward_func/std": 0.0,
      "step": 1605,
      "step_time": 19.7960554510355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 124.375,
      "completions/mean_terminated_length": 124.375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2811974287033081,
      "epoch": 0.07438628994905049,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008349127019755542,
      "kl": 0.0010238258837489411,
      "learning_rate": 9.851320055581287e-07,
      "loss": 0.0001,
      "num_tokens": 44140162.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1606,
      "step_time": 14.299282133579254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 180.75,
      "completions/mean_terminated_length": 180.75,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.34402915835380554,
      "epoch": 0.0744326076887448,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014503782149404287,
      "kl": 0.0014425396511796862,
      "learning_rate": 9.851227420101898e-07,
      "loss": 0.0001,
      "num_tokens": 44160542.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1607,
      "step_time": 18.33135963231325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 280.0,
      "completions/max_terminated_length": 280.0,
      "completions/mean_length": 256.75,
      "completions/mean_terminated_length": 256.75,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "entropy": 0.20412952452898026,
      "epoch": 0.07447892542843909,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001471884548664093,
      "kl": 0.0010712187213357538,
      "learning_rate": 9.85113478462251e-07,
      "loss": 0.0001,
      "num_tokens": 44195850.0,
      "reward": 0.4345982074737549,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.4345982074737549,
      "rewards/reward_func/std": 0.0,
      "step": 1608,
      "step_time": 26.500352159142494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 167.8125,
      "completions/mean_terminated_length": 167.8125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3420803025364876,
      "epoch": 0.0745252431681334,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014416154008358717,
      "kl": 0.0013742977171204984,
      "learning_rate": 9.85104214914312e-07,
      "loss": 0.0001,
      "num_tokens": 44216631.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1609,
      "step_time": 18.223748851567507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 173.4375,
      "completions/mean_terminated_length": 173.4375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.20950258150696754,
      "epoch": 0.0745715609078277,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08828388899564743,
      "kl": 0.0010750300280051306,
      "learning_rate": 9.850949513663734e-07,
      "loss": -0.0267,
      "num_tokens": 44245278.0,
      "reward": 0.3630557060241699,
      "reward_std": 0.15361300110816956,
      "rewards/reward_func/mean": 0.3630557060241699,
      "rewards/reward_func/std": 0.15361300110816956,
      "step": 1610,
      "step_time": 19.29606306180358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 184.5,
      "completions/mean_terminated_length": 184.5,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4001855403184891,
      "epoch": 0.074617878647522,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1239161491394043,
      "kl": 0.0018652789876796305,
      "learning_rate": 9.850856878184343e-07,
      "loss": -0.0575,
      "num_tokens": 44266630.0,
      "reward": 0.23001110553741455,
      "reward_std": 0.4114563763141632,
      "rewards/reward_func/mean": 0.23001110553741455,
      "rewards/reward_func/std": 0.4114563763141632,
      "step": 1611,
      "step_time": 20.396337650716305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 185.25,
      "completions/mean_terminated_length": 185.25,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.25959864258766174,
      "epoch": 0.0746641963872163,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08245521783828735,
      "kl": 0.0014883537369314581,
      "learning_rate": 9.850764242704954e-07,
      "loss": 0.0429,
      "num_tokens": 44303002.0,
      "reward": 0.9382164478302002,
      "reward_std": 0.24713435769081116,
      "rewards/reward_func/mean": 0.9382164478302002,
      "rewards/reward_func/std": 0.24713437259197235,
      "step": 1612,
      "step_time": 23.64640225470066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 342.0,
      "completions/max_terminated_length": 342.0,
      "completions/mean_length": 299.75,
      "completions/mean_terminated_length": 299.75,
      "completions/min_length": 260.0,
      "completions/min_terminated_length": 260.0,
      "entropy": 0.1900012157857418,
      "epoch": 0.07471051412691061,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06945158541202545,
      "kl": 0.001918523252243176,
      "learning_rate": 9.850671607225568e-07,
      "loss": -0.0014,
      "num_tokens": 44338950.0,
      "reward": 0.9522884488105774,
      "reward_std": 0.1303727775812149,
      "rewards/reward_func/mean": 0.9522884488105774,
      "rewards/reward_func/std": 0.1303727775812149,
      "step": 1613,
      "step_time": 30.280744925141335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 122.3125,
      "completions/mean_terminated_length": 122.3125,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.3087422326207161,
      "epoch": 0.07475683186660491,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015073319664224982,
      "kl": 0.0014248942316044122,
      "learning_rate": 9.85057897174618e-07,
      "loss": 0.0001,
      "num_tokens": 44369915.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1614,
      "step_time": 16.27818715199828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 211.0625,
      "completions/mean_terminated_length": 211.0625,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.2519189082086086,
      "epoch": 0.07480314960629922,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08111374825239182,
      "kl": 0.0017523058922961354,
      "learning_rate": 9.85048633626679e-07,
      "loss": -0.0278,
      "num_tokens": 44392028.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 1615,
      "step_time": 20.951961275190115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 157.8125,
      "completions/mean_terminated_length": 157.8125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3702063113451004,
      "epoch": 0.07484946734599351,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001280409749597311,
      "kl": 0.0014684416237287223,
      "learning_rate": 9.850393700787402e-07,
      "loss": 0.0001,
      "num_tokens": 44414841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1616,
      "step_time": 16.872747104614973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 119.6875,
      "completions/mean_terminated_length": 119.6875,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.3385022059082985,
      "epoch": 0.07489578508568782,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014231239911168814,
      "kl": 0.0013049719855189323,
      "learning_rate": 9.850301065308013e-07,
      "loss": 0.0001,
      "num_tokens": 44451044.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1617,
      "step_time": 16.985461961477995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 135.5625,
      "completions/mean_terminated_length": 135.5625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.26226063817739487,
      "epoch": 0.07494210282538212,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001386523130349815,
      "kl": 0.0012260834337212145,
      "learning_rate": 9.850208429828624e-07,
      "loss": 0.0001,
      "num_tokens": 44479293.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1618,
      "step_time": 15.900365706533194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 167.1875,
      "completions/mean_terminated_length": 167.1875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.30687885731458664,
      "epoch": 0.07498842056507643,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018912419909611344,
      "kl": 0.0014387187839020044,
      "learning_rate": 9.850115794349235e-07,
      "loss": 0.0001,
      "num_tokens": 44500784.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1619,
      "step_time": 18.744129680097103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 357.0,
      "completions/max_terminated_length": 357.0,
      "completions/mean_length": 305.375,
      "completions/mean_terminated_length": 305.375,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.15258891135454178,
      "epoch": 0.07503473830477073,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04254131019115448,
      "kl": 0.0006999806792009622,
      "learning_rate": 9.850023158869847e-07,
      "loss": -0.1472,
      "num_tokens": 44529510.0,
      "reward": 0.6036472320556641,
      "reward_std": 0.225793719291687,
      "rewards/reward_func/mean": 0.6036472320556641,
      "rewards/reward_func/std": 0.225793719291687,
      "step": 1620,
      "step_time": 32.11163375899196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 179.125,
      "completions/mean_terminated_length": 179.125,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.1882282979786396,
      "epoch": 0.07508105604446504,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12540240585803986,
      "kl": 0.0011414197651902214,
      "learning_rate": 9.849930523390458e-07,
      "loss": -0.0573,
      "num_tokens": 44556328.0,
      "reward": 0.6468878984451294,
      "reward_std": 0.3517628312110901,
      "rewards/reward_func/mean": 0.6468878984451294,
      "rewards/reward_func/std": 0.3517628610134125,
      "step": 1621,
      "step_time": 20.640837877988815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 185.125,
      "completions/mean_terminated_length": 185.125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.39952103793621063,
      "epoch": 0.07512737378415933,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001758892904035747,
      "kl": 0.0016571315354667604,
      "learning_rate": 9.84983788791107e-07,
      "loss": 0.0001,
      "num_tokens": 44584762.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1622,
      "step_time": 19.941069405525923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 135.0,
      "completions/mean_terminated_length": 135.0,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.29836906492710114,
      "epoch": 0.07517369152385364,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037823189049959183,
      "kl": 0.0022289794869720936,
      "learning_rate": 9.84974525243168e-07,
      "loss": 0.0001,
      "num_tokens": 44606314.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1623,
      "step_time": 14.352275010198355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 142.6875,
      "completions/mean_terminated_length": 142.6875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.3897210881114006,
      "epoch": 0.07522000926354794,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012487727217376232,
      "kl": 0.0014665639901068062,
      "learning_rate": 9.849652616952292e-07,
      "loss": 0.0001,
      "num_tokens": 44630213.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1624,
      "step_time": 16.667425740510225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 129.8125,
      "completions/mean_terminated_length": 129.8125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.27875569462776184,
      "epoch": 0.07526632700324225,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027796023059636354,
      "kl": 0.001816785748815164,
      "learning_rate": 9.849559981472903e-07,
      "loss": 0.0001,
      "num_tokens": 44652434.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1625,
      "step_time": 15.079639580100775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 173.6875,
      "completions/mean_terminated_length": 173.6875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.36077074706554413,
      "epoch": 0.07531264474293654,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002555923303589225,
      "kl": 0.00194956932682544,
      "learning_rate": 9.849467345993516e-07,
      "loss": 0.0001,
      "num_tokens": 44698413.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1626,
      "step_time": 24.086150523275137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 184.875,
      "completions/mean_terminated_length": 184.875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.3735076040029526,
      "epoch": 0.07535896248263085,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015340592944994569,
      "kl": 0.0015938914439175278,
      "learning_rate": 9.849374710514127e-07,
      "loss": 0.0001,
      "num_tokens": 44724267.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1627,
      "step_time": 20.119130540639162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 135.625,
      "completions/mean_terminated_length": 135.625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3603271543979645,
      "epoch": 0.07540528022232515,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019060707418248057,
      "kl": 0.0018551453249529004,
      "learning_rate": 9.849282075034739e-07,
      "loss": 0.0001,
      "num_tokens": 44746485.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1628,
      "step_time": 15.419531498104334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 203.3125,
      "completions/mean_terminated_length": 203.3125,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.3615184426307678,
      "epoch": 0.07545159796201946,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10142161697149277,
      "kl": 0.002262524183606729,
      "learning_rate": 9.849189439555348e-07,
      "loss": -0.0191,
      "num_tokens": 44774042.0,
      "reward": 0.5301079750061035,
      "reward_std": 0.4834319055080414,
      "rewards/reward_func/mean": 0.5301079750061035,
      "rewards/reward_func/std": 0.48343193531036377,
      "step": 1629,
      "step_time": 21.876350447535515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 176.125,
      "completions/mean_terminated_length": 176.125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3909710720181465,
      "epoch": 0.07549791570171376,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09627868980169296,
      "kl": 0.006698896875604987,
      "learning_rate": 9.849096804075961e-07,
      "loss": -0.0761,
      "num_tokens": 44797948.0,
      "reward": 0.05655233934521675,
      "reward_std": 0.2262093424797058,
      "rewards/reward_func/mean": 0.05655233934521675,
      "rewards/reward_func/std": 0.226209357380867,
      "step": 1630,
      "step_time": 21.573106106370687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 130.9375,
      "completions/mean_terminated_length": 130.9375,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2726520821452141,
      "epoch": 0.07554423344140807,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011840288061648607,
      "kl": 0.0011821826919913292,
      "learning_rate": 9.849004168596572e-07,
      "loss": 0.0001,
      "num_tokens": 44821163.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1631,
      "step_time": 16.576533257961273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 188.25,
      "completions/mean_terminated_length": 188.25,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.40685446560382843,
      "epoch": 0.07559055118110236,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001472765114158392,
      "kl": 0.0014241637545637786,
      "learning_rate": 9.848911533117184e-07,
      "loss": 0.0001,
      "num_tokens": 44851631.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1632,
      "step_time": 20.087729662656784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 373.0,
      "completions/max_terminated_length": 373.0,
      "completions/mean_length": 243.375,
      "completions/mean_terminated_length": 243.375,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "entropy": 0.4612579941749573,
      "epoch": 0.07563686892079667,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09092710167169571,
      "kl": 0.0021810660255141556,
      "learning_rate": 9.848818897637795e-07,
      "loss": -0.133,
      "num_tokens": 44887845.0,
      "reward": 0.059552330523729324,
      "reward_std": 0.2382093220949173,
      "rewards/reward_func/mean": 0.059552330523729324,
      "rewards/reward_func/std": 0.2382093369960785,
      "step": 1633,
      "step_time": 33.17604085803032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 205.0,
      "completions/mean_terminated_length": 205.0,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.304825134575367,
      "epoch": 0.07568318666049097,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015276845078915358,
      "kl": 0.0012091417738702148,
      "learning_rate": 9.848726262158406e-07,
      "loss": 0.0001,
      "num_tokens": 44912869.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1634,
      "step_time": 21.763545881956816
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 161.4375,
      "completions/mean_terminated_length": 161.4375,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.194634061306715,
      "epoch": 0.07572950440018528,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013067872496321797,
      "kl": 0.0009507545328233391,
      "learning_rate": 9.848633626679017e-07,
      "loss": 0.0,
      "num_tokens": 44962972.0,
      "reward": 0.8890097737312317,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8890097737312317,
      "rewards/reward_func/std": 0.0,
      "step": 1635,
      "step_time": 23.300194274634123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 183.125,
      "completions/mean_terminated_length": 183.125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.2663589380681515,
      "epoch": 0.07577582213987957,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08443693071603775,
      "kl": 0.0018062372982967645,
      "learning_rate": 9.848540991199629e-07,
      "loss": -0.0166,
      "num_tokens": 44988190.0,
      "reward": 0.07100986689329147,
      "reward_std": 0.0033722228836268187,
      "rewards/reward_func/mean": 0.07100986689329147,
      "rewards/reward_func/std": 0.003372224047780037,
      "step": 1636,
      "step_time": 19.75159054249525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 163.3125,
      "completions/mean_terminated_length": 163.3125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.38820867985486984,
      "epoch": 0.07582213987957388,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00270209857262671,
      "kl": 0.0018760527891572565,
      "learning_rate": 9.84844835572024e-07,
      "loss": 0.0001,
      "num_tokens": 45012515.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1637,
      "step_time": 17.12164083123207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 173.4375,
      "completions/mean_terminated_length": 173.4375,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.4071737304329872,
      "epoch": 0.07586845761926818,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004578940104693174,
      "kl": 0.002200766874011606,
      "learning_rate": 9.848355720240851e-07,
      "loss": 0.0001,
      "num_tokens": 45034874.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1638,
      "step_time": 17.77417979389429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 171.625,
      "completions/mean_terminated_length": 171.625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.383428193628788,
      "epoch": 0.07591477535896249,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013156926725059748,
      "kl": 0.001637956127524376,
      "learning_rate": 9.848263084761462e-07,
      "loss": 0.0001,
      "num_tokens": 45060996.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1639,
      "step_time": 19.570538219064474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 171.8125,
      "completions/mean_terminated_length": 171.8125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.38874175399541855,
      "epoch": 0.07596109309865678,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013463308569043875,
      "kl": 0.0016464151558466256,
      "learning_rate": 9.848170449282076e-07,
      "loss": 0.0001,
      "num_tokens": 45095553.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1640,
      "step_time": 21.0771097317338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 183.375,
      "completions/mean_terminated_length": 183.375,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.25370409339666367,
      "epoch": 0.0760074108383511,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.103360615670681,
      "kl": 0.0014762329519726336,
      "learning_rate": 9.848077813802685e-07,
      "loss": -0.0331,
      "num_tokens": 45121559.0,
      "reward": 0.8410100936889648,
      "reward_std": 0.2242693305015564,
      "rewards/reward_func/mean": 0.8410100936889648,
      "rewards/reward_func/std": 0.2242693454027176,
      "step": 1641,
      "step_time": 20.875794924795628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 118.3125,
      "completions/mean_terminated_length": 118.3125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.3047468140721321,
      "epoch": 0.07605372857804539,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015164186479523778,
      "kl": 0.001252879883395508,
      "learning_rate": 9.847985178323296e-07,
      "loss": 0.0001,
      "num_tokens": 45144668.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1642,
      "step_time": 13.751232896000147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 117.0,
      "completions/mean_terminated_length": 117.0,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.24536456167697906,
      "epoch": 0.0761000463177397,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016447966918349266,
      "kl": 0.0011007676948793232,
      "learning_rate": 9.84789254284391e-07,
      "loss": 0.0001,
      "num_tokens": 45164012.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1643,
      "step_time": 12.875072479248047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 120.0,
      "completions/mean_terminated_length": 120.0,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.26822152733802795,
      "epoch": 0.076146364057434,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014926041476428509,
      "kl": 0.00132352503715083,
      "learning_rate": 9.84779990736452e-07,
      "loss": 0.0001,
      "num_tokens": 45183548.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1644,
      "step_time": 13.561577666550875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 116.4375,
      "completions/mean_terminated_length": 116.4375,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.34725936502218246,
      "epoch": 0.0761926817971283,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010282954899594188,
      "kl": 0.0014080363616812974,
      "learning_rate": 9.847707271885132e-07,
      "loss": 0.0001,
      "num_tokens": 45206627.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1645,
      "step_time": 14.227275379002094
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 202.625,
      "completions/mean_terminated_length": 202.625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.37325354665517807,
      "epoch": 0.0762389995368226,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016776693519204855,
      "kl": 0.0019049591792281717,
      "learning_rate": 9.847614636405743e-07,
      "loss": 0.0001,
      "num_tokens": 45231533.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1646,
      "step_time": 20.6542307138443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 175.75,
      "completions/mean_terminated_length": 175.75,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4133910909295082,
      "epoch": 0.07628531727651691,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022800853475928307,
      "kl": 0.0018994792480953038,
      "learning_rate": 9.847522000926355e-07,
      "loss": 0.0001,
      "num_tokens": 45252281.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1647,
      "step_time": 18.645898014307022
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 129.8125,
      "completions/mean_terminated_length": 129.8125,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.25476617366075516,
      "epoch": 0.07633163501621121,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002013625344261527,
      "kl": 0.0015150157851167023,
      "learning_rate": 9.847429365446966e-07,
      "loss": 0.0001,
      "num_tokens": 45272870.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1648,
      "step_time": 14.163351621478796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 167.375,
      "completions/mean_terminated_length": 167.375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.15188464522361755,
      "epoch": 0.07637795275590552,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06743628531694412,
      "kl": 0.0006581193738384172,
      "learning_rate": 9.847336729967577e-07,
      "loss": -0.0468,
      "num_tokens": 45300924.0,
      "reward": 0.856032133102417,
      "reward_std": 0.22827444970607758,
      "rewards/reward_func/mean": 0.856032133102417,
      "rewards/reward_func/std": 0.22827443480491638,
      "step": 1649,
      "step_time": 18.76086140051484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 197.0625,
      "completions/mean_terminated_length": 197.0625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.21695607155561447,
      "epoch": 0.07642427049559981,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0834154337644577,
      "kl": 0.001183260334073566,
      "learning_rate": 9.847244094488188e-07,
      "loss": -0.0288,
      "num_tokens": 45326669.0,
      "reward": 0.4902341961860657,
      "reward_std": 0.06334785372018814,
      "rewards/reward_func/mean": 0.4902341961860657,
      "rewards/reward_func/std": 0.06334785372018814,
      "step": 1650,
      "step_time": 20.830961029976606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 191.6875,
      "completions/mean_terminated_length": 191.6875,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.4435461536049843,
      "epoch": 0.07647058823529412,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008816600893624127,
      "kl": 0.0012813075154554099,
      "learning_rate": 9.8471514590088e-07,
      "loss": 0.0001,
      "num_tokens": 45355416.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1651,
      "step_time": 20.361383739858866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 406.0,
      "completions/max_terminated_length": 406.0,
      "completions/mean_length": 310.8125,
      "completions/mean_terminated_length": 310.8125,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.2690752036869526,
      "epoch": 0.07651690597498842,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.057550378143787384,
      "kl": 0.0015795712533872575,
      "learning_rate": 9.84705882352941e-07,
      "loss": -0.169,
      "num_tokens": 45382005.0,
      "reward": 0.8120216131210327,
      "reward_std": 0.3954293727874756,
      "rewards/reward_func/mean": 0.8120216131210327,
      "rewards/reward_func/std": 0.3954293727874756,
      "step": 1652,
      "step_time": 32.91077008843422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 176.5,
      "completions/mean_terminated_length": 176.5,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.21322758495807648,
      "epoch": 0.07656322371468273,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.076081283390522,
      "kl": 0.0016789287910796702,
      "learning_rate": 9.846966188050024e-07,
      "loss": -0.04,
      "num_tokens": 45404445.0,
      "reward": 0.6782035231590271,
      "reward_std": 0.08704688400030136,
      "rewards/reward_func/mean": 0.6782035231590271,
      "rewards/reward_func/std": 0.08704687654972076,
      "step": 1653,
      "step_time": 18.4552771858871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 134.8125,
      "completions/mean_terminated_length": 134.8125,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.20299510657787323,
      "epoch": 0.07660954145437703,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019456911832094193,
      "kl": 0.0011640992743195966,
      "learning_rate": 9.846873552570633e-07,
      "loss": 0.0001,
      "num_tokens": 45424106.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1654,
      "step_time": 15.224860787391663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 148.875,
      "completions/mean_terminated_length": 148.875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.15970367565751076,
      "epoch": 0.07665585919407134,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28535717725753784,
      "kl": 0.002173017419409007,
      "learning_rate": 9.846780917091245e-07,
      "loss": -0.033,
      "num_tokens": 45461304.0,
      "reward": 0.9068578481674194,
      "reward_std": 0.20066522061824799,
      "rewards/reward_func/mean": 0.9068578481674194,
      "rewards/reward_func/std": 0.20066523551940918,
      "step": 1655,
      "step_time": 19.46072293817997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 178.4375,
      "completions/mean_terminated_length": 178.4375,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.40137291699647903,
      "epoch": 0.07670217693376563,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002324209315702319,
      "kl": 0.0016238936514128,
      "learning_rate": 9.846688281611856e-07,
      "loss": 0.0001,
      "num_tokens": 45485135.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1656,
      "step_time": 18.17797204479575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 359.0,
      "completions/max_terminated_length": 359.0,
      "completions/mean_length": 270.1875,
      "completions/mean_terminated_length": 270.1875,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "entropy": 0.4296865537762642,
      "epoch": 0.07674849467345994,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07285812497138977,
      "kl": 0.0013943906524218619,
      "learning_rate": 9.84659564613247e-07,
      "loss": -0.1114,
      "num_tokens": 45515570.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 1657,
      "step_time": 30.241368554532528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 168.4375,
      "completions/mean_terminated_length": 168.4375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.38904421031475067,
      "epoch": 0.07679481241315424,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00254133902490139,
      "kl": 0.0017448347643949091,
      "learning_rate": 9.84650301065308e-07,
      "loss": 0.0001,
      "num_tokens": 45550745.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1658,
      "step_time": 20.820831935852766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 138.625,
      "completions/mean_terminated_length": 138.625,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.29940513521432877,
      "epoch": 0.07684113015284855,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006967560038901865,
      "kl": 0.0010478577751200646,
      "learning_rate": 9.846410375173692e-07,
      "loss": 0.0001,
      "num_tokens": 45578579.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1659,
      "step_time": 16.639745496213436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 150.625,
      "completions/mean_terminated_length": 150.625,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3030737563967705,
      "epoch": 0.07688744789254284,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003103468334302306,
      "kl": 0.001979161344934255,
      "learning_rate": 9.846317739694303e-07,
      "loss": 0.0001,
      "num_tokens": 45601181.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1660,
      "step_time": 15.522049743682146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 352.0,
      "completions/max_terminated_length": 352.0,
      "completions/mean_length": 210.125,
      "completions/mean_terminated_length": 210.125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3735486790537834,
      "epoch": 0.07693376563223715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08614269644021988,
      "kl": 0.0014472455659415573,
      "learning_rate": 9.846225104214914e-07,
      "loss": -0.0463,
      "num_tokens": 45623711.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 1661,
      "step_time": 28.701392497867346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 112.8125,
      "completions/mean_terminated_length": 112.8125,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.2954092025756836,
      "epoch": 0.07698008337193145,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011567731853574514,
      "kl": 0.0012997614976484329,
      "learning_rate": 9.846132468735525e-07,
      "loss": 0.0001,
      "num_tokens": 45643772.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1662,
      "step_time": 13.335916545242071
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 118.3125,
      "completions/mean_terminated_length": 118.3125,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.26379984617233276,
      "epoch": 0.07702640111162576,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017720244359225035,
      "kl": 0.0015850586059968919,
      "learning_rate": 9.846039833256137e-07,
      "loss": 0.0001,
      "num_tokens": 45665745.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1663,
      "step_time": 14.801113799214363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 130.0,
      "completions/max_terminated_length": 130.0,
      "completions/mean_length": 115.5625,
      "completions/mean_terminated_length": 115.5625,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.28533728420734406,
      "epoch": 0.07707271885132005,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018279367359355092,
      "kl": 0.0016257822280749679,
      "learning_rate": 9.845947197776748e-07,
      "loss": 0.0001,
      "num_tokens": 45687946.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1664,
      "step_time": 12.986000373959541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 150.9375,
      "completions/mean_terminated_length": 150.9375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3908330276608467,
      "epoch": 0.07711903659101436,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00139794556889683,
      "kl": 0.001606762147275731,
      "learning_rate": 9.84585456229736e-07,
      "loss": 0.0001,
      "num_tokens": 45739417.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1665,
      "step_time": 23.29428631067276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 123.875,
      "completions/mean_terminated_length": 123.875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.29769379273056984,
      "epoch": 0.07716535433070866,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015150794060900807,
      "kl": 0.0017644141626078635,
      "learning_rate": 9.84576192681797e-07,
      "loss": 0.0001,
      "num_tokens": 45763639.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1666,
      "step_time": 14.191865853965282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 137.3125,
      "completions/mean_terminated_length": 137.3125,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.19898555427789688,
      "epoch": 0.07721167207040297,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0067122201435267925,
      "kl": 0.0034142808872275054,
      "learning_rate": 9.845669291338582e-07,
      "loss": 0.0002,
      "num_tokens": 45794748.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1667,
      "step_time": 16.306269992142916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 132.6875,
      "completions/mean_terminated_length": 132.6875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.3299274146556854,
      "epoch": 0.07725798981009727,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016279020346701145,
      "kl": 0.0014249913510866463,
      "learning_rate": 9.845576655859193e-07,
      "loss": 0.0001,
      "num_tokens": 45818695.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1668,
      "step_time": 15.443460620939732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 135.9375,
      "completions/mean_terminated_length": 135.9375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.245439812541008,
      "epoch": 0.07730430754979158,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001229665125720203,
      "kl": 0.0012690881558228284,
      "learning_rate": 9.845484020379804e-07,
      "loss": 0.0001,
      "num_tokens": 45838486.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1669,
      "step_time": 14.933586858212948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 123.0,
      "completions/max_terminated_length": 123.0,
      "completions/mean_length": 110.5,
      "completions/mean_terminated_length": 110.5,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.2375088967382908,
      "epoch": 0.07735062528948587,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012708210851997137,
      "kl": 0.0009623114892747253,
      "learning_rate": 9.845391384900417e-07,
      "loss": 0.0,
      "num_tokens": 45860350.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1670,
      "step_time": 12.734601717442274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 184.25,
      "completions/mean_terminated_length": 184.25,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.3350832015275955,
      "epoch": 0.07739694302918018,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002224243711680174,
      "kl": 0.0026303930208086967,
      "learning_rate": 9.845298749421029e-07,
      "loss": 0.0001,
      "num_tokens": 45902258.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1671,
      "step_time": 24.80718930065632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 208.0625,
      "completions/mean_terminated_length": 208.0625,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.2840676084160805,
      "epoch": 0.07744326076887448,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008224576595239341,
      "kl": 0.0009848253830568865,
      "learning_rate": 9.845206113941638e-07,
      "loss": 0.0,
      "num_tokens": 45930563.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1672,
      "step_time": 23.42478010803461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 146.75,
      "completions/mean_terminated_length": 146.75,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.17919356748461723,
      "epoch": 0.07748957850856879,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005466792266815901,
      "kl": 0.00216941034886986,
      "learning_rate": 9.845113478462251e-07,
      "loss": 0.0001,
      "num_tokens": 45957279.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 1673,
      "step_time": 18.26247502863407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 193.6875,
      "completions/mean_terminated_length": 193.6875,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.34672409296035767,
      "epoch": 0.07753589624826308,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10089165717363358,
      "kl": 0.0020296200236771256,
      "learning_rate": 9.845020842982862e-07,
      "loss": 0.0926,
      "num_tokens": 45988762.0,
      "reward": 0.05708860233426094,
      "reward_std": 0.08745308220386505,
      "rewards/reward_func/mean": 0.05708860233426094,
      "rewards/reward_func/std": 0.08745308220386505,
      "step": 1674,
      "step_time": 23.20253485813737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 250.0625,
      "completions/mean_terminated_length": 250.0625,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "entropy": 0.20240335538983345,
      "epoch": 0.0775822139879574,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012101448373869061,
      "kl": 0.0011594314710237086,
      "learning_rate": 9.844928207503474e-07,
      "loss": 0.0001,
      "num_tokens": 46024843.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1675,
      "step_time": 27.527564823627472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 255.0,
      "completions/max_terminated_length": 255.0,
      "completions/mean_length": 207.125,
      "completions/mean_terminated_length": 207.125,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.2570352144539356,
      "epoch": 0.07762853172765169,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06474600732326508,
      "kl": 0.0013402775075519457,
      "learning_rate": 9.844835572024085e-07,
      "loss": -0.0174,
      "num_tokens": 46049229.0,
      "reward": 0.9291586875915527,
      "reward_std": 0.2500021159648895,
      "rewards/reward_func/mean": 0.9291586875915527,
      "rewards/reward_func/std": 0.2500021159648895,
      "step": 1676,
      "step_time": 22.279742319136858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 162.4375,
      "completions/mean_terminated_length": 162.4375,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.3593992218375206,
      "epoch": 0.077674849467346,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014101880369707942,
      "kl": 0.0013620714307762682,
      "learning_rate": 9.844742936544696e-07,
      "loss": 0.0001,
      "num_tokens": 46074676.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1677,
      "step_time": 17.406505409628153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 131.9375,
      "completions/mean_terminated_length": 131.9375,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.27665281295776367,
      "epoch": 0.0777211672070403,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016846642829477787,
      "kl": 0.0013541773951146752,
      "learning_rate": 9.844650301065307e-07,
      "loss": 0.0001,
      "num_tokens": 46105635.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1678,
      "step_time": 16.720541026443243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 121.4375,
      "completions/mean_terminated_length": 121.4375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.37001293897628784,
      "epoch": 0.0777674849467346,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001234093215316534,
      "kl": 0.0012913451646454632,
      "learning_rate": 9.844557665585919e-07,
      "loss": 0.0001,
      "num_tokens": 46138362.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1679,
      "step_time": 15.906921125948429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 144.25,
      "completions/mean_terminated_length": 144.25,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.36959002166986465,
      "epoch": 0.0778138026864289,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019340359140187502,
      "kl": 0.0018135556892957538,
      "learning_rate": 9.84446503010653e-07,
      "loss": 0.0001,
      "num_tokens": 46164350.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1680,
      "step_time": 17.01923667639494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 194.875,
      "completions/mean_terminated_length": 194.875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.34608144313097,
      "epoch": 0.07786012042612321,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10357584804296494,
      "kl": 0.0018900996656157076,
      "learning_rate": 9.844372394627141e-07,
      "loss": -0.0511,
      "num_tokens": 46198684.0,
      "reward": 0.4375,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.4375,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 1681,
      "step_time": 22.219603832811117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 133.25,
      "completions/mean_terminated_length": 133.25,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3109801709651947,
      "epoch": 0.07790643816581751,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035751787945628166,
      "kl": 0.0013909574190620333,
      "learning_rate": 9.844279759147752e-07,
      "loss": 0.0001,
      "num_tokens": 46229856.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1682,
      "step_time": 17.34016814827919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 155.3125,
      "completions/mean_terminated_length": 155.3125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.42632104456424713,
      "epoch": 0.07795275590551182,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001590759726241231,
      "kl": 0.0018257541814818978,
      "learning_rate": 9.844187123668366e-07,
      "loss": 0.0001,
      "num_tokens": 46271845.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1683,
      "step_time": 21.376418214291334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 168.3125,
      "completions/mean_terminated_length": 168.3125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.4322524890303612,
      "epoch": 0.07799907364520611,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030773428734391928,
      "kl": 0.00173371157143265,
      "learning_rate": 9.844094488188977e-07,
      "loss": 0.0001,
      "num_tokens": 46305066.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1684,
      "step_time": 20.002454344183207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 127.4375,
      "completions/mean_terminated_length": 127.4375,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2681782655417919,
      "epoch": 0.07804539138490042,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013541457010433078,
      "kl": 0.0009537218720652163,
      "learning_rate": 9.844001852709586e-07,
      "loss": 0.0,
      "num_tokens": 46335537.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1685,
      "step_time": 16.65060442686081
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 147.25,
      "completions/mean_terminated_length": 147.25,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2002566084265709,
      "epoch": 0.07809170912459472,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11434628814458847,
      "kl": 0.0013527112314477563,
      "learning_rate": 9.843909217230197e-07,
      "loss": -0.0046,
      "num_tokens": 46356949.0,
      "reward": 0.366998553276062,
      "reward_std": 0.18222030997276306,
      "rewards/reward_func/mean": 0.366998553276062,
      "rewards/reward_func/std": 0.18222030997276306,
      "step": 1686,
      "step_time": 16.181255109608173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 134.1875,
      "completions/mean_terminated_length": 134.1875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.25187139958143234,
      "epoch": 0.07813802686428903,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027301819063723087,
      "kl": 0.0014639183063991368,
      "learning_rate": 9.84381658175081e-07,
      "loss": 0.0001,
      "num_tokens": 46376760.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1687,
      "step_time": 14.960896357893944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 128.125,
      "completions/mean_terminated_length": 128.125,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.33778195083141327,
      "epoch": 0.07818434460398332,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001593102584592998,
      "kl": 0.0016300349379889667,
      "learning_rate": 9.843723946271422e-07,
      "loss": 0.0001,
      "num_tokens": 46412586.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1688,
      "step_time": 17.47895924001932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 138.6875,
      "completions/mean_terminated_length": 138.6875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.43217572569847107,
      "epoch": 0.07823066234367763,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013693617656826973,
      "kl": 0.0015838368562981486,
      "learning_rate": 9.843631310792033e-07,
      "loss": 0.0001,
      "num_tokens": 46436965.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1689,
      "step_time": 14.800542384386063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 140.25,
      "completions/mean_terminated_length": 140.25,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.38462790846824646,
      "epoch": 0.07827698008337193,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022047506645321846,
      "kl": 0.0020656742271967232,
      "learning_rate": 9.843538675312645e-07,
      "loss": 0.0001,
      "num_tokens": 46470089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1690,
      "step_time": 18.441659960895777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 439.0,
      "completions/max_terminated_length": 439.0,
      "completions/mean_length": 343.1875,
      "completions/mean_terminated_length": 343.1875,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "entropy": 0.32002317160367966,
      "epoch": 0.07832329782306624,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05422141030430794,
      "kl": 0.001787881599739194,
      "learning_rate": 9.843446039833256e-07,
      "loss": -0.1295,
      "num_tokens": 46499356.0,
      "reward": 0.6308025121688843,
      "reward_std": 0.39480242133140564,
      "rewards/reward_func/mean": 0.6308025121688843,
      "rewards/reward_func/std": 0.39480242133140564,
      "step": 1691,
      "step_time": 36.27641510590911
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 188.25,
      "completions/mean_terminated_length": 188.25,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.3893887996673584,
      "epoch": 0.07836961556276054,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002944176783785224,
      "kl": 0.0020869087893515825,
      "learning_rate": 9.843353404353867e-07,
      "loss": 0.0001,
      "num_tokens": 46530816.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1692,
      "step_time": 21.236405465751886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.29504361748695374,
      "epoch": 0.07841593330245485,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019446397200226784,
      "kl": 0.001616210414795205,
      "learning_rate": 9.843260768874478e-07,
      "loss": 0.0001,
      "num_tokens": 46560142.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1693,
      "step_time": 18.738793417811394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 126.8125,
      "completions/mean_terminated_length": 126.8125,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3024235740303993,
      "epoch": 0.07846225104214914,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009588732500560582,
      "kl": 0.001039199487422593,
      "learning_rate": 9.84316813339509e-07,
      "loss": 0.0001,
      "num_tokens": 46580699.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1694,
      "step_time": 14.138744950294495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 173.5,
      "completions/mean_terminated_length": 173.5,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.4446135610342026,
      "epoch": 0.07850856878184345,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0046493783593177795,
      "kl": 0.0029429663554765284,
      "learning_rate": 9.8430754979157e-07,
      "loss": 0.0001,
      "num_tokens": 46625347.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1695,
      "step_time": 23.11709763109684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 152.3125,
      "completions/mean_terminated_length": 152.3125,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.36051344126462936,
      "epoch": 0.07855488652153775,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001356164226308465,
      "kl": 0.0017850837029982358,
      "learning_rate": 9.842982862436314e-07,
      "loss": 0.0001,
      "num_tokens": 46647416.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1696,
      "step_time": 16.97948555275798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 151.6875,
      "completions/mean_terminated_length": 151.6875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.14736872911453247,
      "epoch": 0.07860120426123206,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07276488840579987,
      "kl": 0.003640395647380501,
      "learning_rate": 9.842890226956923e-07,
      "loss": -0.0075,
      "num_tokens": 46669267.0,
      "reward": 0.9060009717941284,
      "reward_std": 0.18091386556625366,
      "rewards/reward_func/mean": 0.9060009717941284,
      "rewards/reward_func/std": 0.18091386556625366,
      "step": 1697,
      "step_time": 15.534895148128271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 133.625,
      "completions/mean_terminated_length": 133.625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3113630935549736,
      "epoch": 0.07864752200092635,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003892549080774188,
      "kl": 0.0018110026721842587,
      "learning_rate": 9.842797591477535e-07,
      "loss": 0.0001,
      "num_tokens": 46698845.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1698,
      "step_time": 15.903801303356886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 167.125,
      "completions/mean_terminated_length": 167.125,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.2357831597328186,
      "epoch": 0.07869383974062066,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11127041280269623,
      "kl": 0.0028912353445775807,
      "learning_rate": 9.842704955998146e-07,
      "loss": 0.0851,
      "num_tokens": 46720431.0,
      "reward": 0.1674453616142273,
      "reward_std": 0.06536397337913513,
      "rewards/reward_func/mean": 0.1674453616142273,
      "rewards/reward_func/std": 0.06536397337913513,
      "step": 1699,
      "step_time": 21.52933993190527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 164.9375,
      "completions/mean_terminated_length": 164.9375,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.14909964427351952,
      "epoch": 0.07874015748031496,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006626087124459445,
      "kl": 0.0005348598788259551,
      "learning_rate": 9.84261232051876e-07,
      "loss": 0.0,
      "num_tokens": 46749582.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 1700,
      "step_time": 17.675500009208918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 127.0625,
      "completions/mean_terminated_length": 127.0625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3040238469839096,
      "epoch": 0.07878647522000927,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001297865412198007,
      "kl": 0.0013960163632873446,
      "learning_rate": 9.84251968503937e-07,
      "loss": 0.0001,
      "num_tokens": 46778047.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1701,
      "step_time": 15.285749927163124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 218.9375,
      "completions/mean_terminated_length": 218.9375,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.18902039900422096,
      "epoch": 0.07883279295970357,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023403866216540337,
      "kl": 0.0017285541980527341,
      "learning_rate": 9.842427049559982e-07,
      "loss": 0.0001,
      "num_tokens": 46800206.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1702,
      "step_time": 21.16633603721857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 233.8125,
      "completions/mean_terminated_length": 233.8125,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.3055119514465332,
      "epoch": 0.07887911069939788,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08222231268882751,
      "kl": 0.0018914344254881144,
      "learning_rate": 9.842334414080593e-07,
      "loss": -0.0129,
      "num_tokens": 46836651.0,
      "reward": 0.16383880376815796,
      "reward_std": 0.35467827320098877,
      "rewards/reward_func/mean": 0.16383880376815796,
      "rewards/reward_func/std": 0.35467830300331116,
      "step": 1703,
      "step_time": 27.084114365279675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 146.25,
      "completions/mean_terminated_length": 146.25,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3746067136526108,
      "epoch": 0.07892542843909217,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001366176176816225,
      "kl": 0.0018012887449003756,
      "learning_rate": 9.842241778601204e-07,
      "loss": 0.0001,
      "num_tokens": 46872207.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1704,
      "step_time": 19.433010537177324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 185.5625,
      "completions/mean_terminated_length": 185.5625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.41927726566791534,
      "epoch": 0.07897174617878648,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00419262982904911,
      "kl": 0.0023562940768897533,
      "learning_rate": 9.842149143121815e-07,
      "loss": 0.0001,
      "num_tokens": 46898568.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1705,
      "step_time": 18.908721026033163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 127.875,
      "completions/mean_terminated_length": 127.875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.3284745365381241,
      "epoch": 0.07901806391848078,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015150196850299835,
      "kl": 0.0015281571249943227,
      "learning_rate": 9.842056507642427e-07,
      "loss": 0.0001,
      "num_tokens": 46922838.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1706,
      "step_time": 14.719692457467318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 167.5625,
      "completions/mean_terminated_length": 167.5625,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.35932251065969467,
      "epoch": 0.07906438165817509,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002645768690854311,
      "kl": 0.002208069316111505,
      "learning_rate": 9.841963872163038e-07,
      "loss": 0.0001,
      "num_tokens": 46945167.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1707,
      "step_time": 17.174181506037712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 142.125,
      "completions/mean_terminated_length": 142.125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.21813251078128815,
      "epoch": 0.07911069939786938,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010103777749463916,
      "kl": 0.0010198226082138717,
      "learning_rate": 9.84187123668365e-07,
      "loss": 0.0001,
      "num_tokens": 46964801.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1708,
      "step_time": 15.079619221389294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 495.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 279.6875,
      "completions/mean_terminated_length": 279.6875,
      "completions/min_length": 221.0,
      "completions/min_terminated_length": 221.0,
      "entropy": 0.5472236573696136,
      "epoch": 0.07915701713756369,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0745278149843216,
      "kl": 0.0019589700386859477,
      "learning_rate": 9.84177860120426e-07,
      "loss": 0.1951,
      "num_tokens": 46991484.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 1709,
      "step_time": 38.763111498206854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 162.375,
      "completions/mean_terminated_length": 162.375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.17142558470368385,
      "epoch": 0.07920333487725799,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008708245120942593,
      "kl": 0.004739268129924312,
      "learning_rate": 9.841685965724872e-07,
      "loss": 0.0002,
      "num_tokens": 47038802.0,
      "reward": 0.8337529301643372,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8337529301643372,
      "rewards/reward_func/std": 0.0,
      "step": 1710,
      "step_time": 23.529833510518074
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 190.125,
      "completions/mean_terminated_length": 190.125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.2060224413871765,
      "epoch": 0.0792496526169523,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10727056860923767,
      "kl": 0.0009348559251520783,
      "learning_rate": 9.841593330245483e-07,
      "loss": -0.0368,
      "num_tokens": 47064644.0,
      "reward": 0.9850083589553833,
      "reward_std": 0.03223112225532532,
      "rewards/reward_func/mean": 0.9850083589553833,
      "rewards/reward_func/std": 0.03223112225532532,
      "step": 1711,
      "step_time": 21.429081067442894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 117.0,
      "completions/max_terminated_length": 117.0,
      "completions/mean_length": 104.375,
      "completions/mean_terminated_length": 104.375,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.23032863065600395,
      "epoch": 0.0792959703566466,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013954452006146312,
      "kl": 0.0011873962357640266,
      "learning_rate": 9.841500694766094e-07,
      "loss": 0.0001,
      "num_tokens": 47083994.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1712,
      "step_time": 11.449270065873861
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 372.0,
      "completions/max_terminated_length": 372.0,
      "completions/mean_length": 282.375,
      "completions/mean_terminated_length": 282.375,
      "completions/min_length": 245.0,
      "completions/min_terminated_length": 245.0,
      "entropy": 0.24487724155187607,
      "epoch": 0.0793422880963409,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06413564085960388,
      "kl": 0.0019074836163781583,
      "learning_rate": 9.841408059286708e-07,
      "loss": -0.0305,
      "num_tokens": 47124448.0,
      "reward": 0.5864511728286743,
      "reward_std": 0.11027967929840088,
      "rewards/reward_func/mean": 0.5864511728286743,
      "rewards/reward_func/std": 0.11027967929840088,
      "step": 1713,
      "step_time": 33.45283553749323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 149.25,
      "completions/mean_terminated_length": 149.25,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3016955330967903,
      "epoch": 0.0793886058360352,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009356994414702058,
      "kl": 0.0010292467050021514,
      "learning_rate": 9.841315423807319e-07,
      "loss": 0.0001,
      "num_tokens": 47159668.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1714,
      "step_time": 19.849836815148592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 155.875,
      "completions/mean_terminated_length": 155.875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3925165981054306,
      "epoch": 0.07943492357572951,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002282223431393504,
      "kl": 0.0024772547476459295,
      "learning_rate": 9.841222788327928e-07,
      "loss": 0.0001,
      "num_tokens": 47211378.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1715,
      "step_time": 23.54809008166194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 130.25,
      "completions/mean_terminated_length": 130.25,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.23693326488137245,
      "epoch": 0.0794812413154238,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012495475821197033,
      "kl": 0.001155626232502982,
      "learning_rate": 9.84113015284854e-07,
      "loss": 0.0001,
      "num_tokens": 47231062.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1716,
      "step_time": 14.24133886769414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 142.0,
      "completions/mean_terminated_length": 142.0,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3047955483198166,
      "epoch": 0.07952755905511812,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009511947864666581,
      "kl": 0.0011468870943645015,
      "learning_rate": 9.841037517369153e-07,
      "loss": 0.0001,
      "num_tokens": 47252758.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1717,
      "step_time": 16.200883217155933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 226.4375,
      "completions/mean_terminated_length": 226.4375,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.2305188961327076,
      "epoch": 0.07957387679481241,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00238050171174109,
      "kl": 0.001595924055436626,
      "learning_rate": 9.840944881889764e-07,
      "loss": 0.0001,
      "num_tokens": 47283325.0,
      "reward": 0.6466840505599976,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6466840505599976,
      "rewards/reward_func/std": 0.0,
      "step": 1718,
      "step_time": 22.60474267974496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 133.875,
      "completions/mean_terminated_length": 133.875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.30449212342500687,
      "epoch": 0.07962019453450672,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012453654780983925,
      "kl": 0.0012665226822718978,
      "learning_rate": 9.840852246410375e-07,
      "loss": 0.0001,
      "num_tokens": 47305531.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1719,
      "step_time": 14.574725233018398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 168.875,
      "completions/mean_terminated_length": 168.875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.25209128856658936,
      "epoch": 0.07966651227420102,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1827867031097412,
      "kl": 0.004445167491212487,
      "learning_rate": 9.840759610930986e-07,
      "loss": -0.0504,
      "num_tokens": 47334121.0,
      "reward": 0.8146799206733704,
      "reward_std": 0.31801846623420715,
      "rewards/reward_func/mean": 0.8146799206733704,
      "rewards/reward_func/std": 0.31801849603652954,
      "step": 1720,
      "step_time": 20.539701025933027
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 162.25,
      "completions/mean_terminated_length": 162.25,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.3810490518808365,
      "epoch": 0.07971283001389533,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005509552545845509,
      "kl": 0.0023259613662958145,
      "learning_rate": 9.840666975451598e-07,
      "loss": 0.0001,
      "num_tokens": 47365661.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1721,
      "step_time": 18.761831019073725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 227.3125,
      "completions/mean_terminated_length": 227.3125,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.423902727663517,
      "epoch": 0.07975914775358962,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020497653167694807,
      "kl": 0.001853982568718493,
      "learning_rate": 9.840574339972209e-07,
      "loss": 0.0001,
      "num_tokens": 47398066.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1722,
      "step_time": 28.940093513578176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 171.1875,
      "completions/mean_terminated_length": 171.1875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.38299621641635895,
      "epoch": 0.07980546549328393,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09138719737529755,
      "kl": 0.0015985459904186428,
      "learning_rate": 9.84048170449282e-07,
      "loss": 0.0412,
      "num_tokens": 47422917.0,
      "reward": 0.033777061849832535,
      "reward_std": 0.13510826230049133,
      "rewards/reward_func/mean": 0.033777061849832535,
      "rewards/reward_func/std": 0.13510826230049133,
      "step": 1723,
      "step_time": 20.05716634169221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 152.875,
      "completions/mean_terminated_length": 152.875,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.446855790913105,
      "epoch": 0.07985178323297823,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012937851715832949,
      "kl": 0.001694143924396485,
      "learning_rate": 9.840389069013431e-07,
      "loss": 0.0001,
      "num_tokens": 47465747.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1724,
      "step_time": 20.9438929669559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 146.0,
      "completions/mean_terminated_length": 146.0,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.33595968037843704,
      "epoch": 0.07989810097267254,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026925739366561174,
      "kl": 0.0018194759904872626,
      "learning_rate": 9.840296433534043e-07,
      "loss": 0.0001,
      "num_tokens": 47487043.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1725,
      "step_time": 17.122764468193054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 201.5,
      "completions/mean_terminated_length": 201.5,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.18933162838220596,
      "epoch": 0.07994441871236684,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08614408224821091,
      "kl": 0.0019587448332458735,
      "learning_rate": 9.840203798054654e-07,
      "loss": 0.0678,
      "num_tokens": 47508763.0,
      "reward": 0.6217309236526489,
      "reward_std": 0.004086330533027649,
      "rewards/reward_func/mean": 0.6217309236526489,
      "rewards/reward_func/std": 0.004086339380592108,
      "step": 1726,
      "step_time": 22.153468146920204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 120.0625,
      "completions/mean_terminated_length": 120.0625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.23470370844006538,
      "epoch": 0.07999073645206115,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011941755656152964,
      "kl": 0.001213880255818367,
      "learning_rate": 9.840111162575267e-07,
      "loss": 0.0001,
      "num_tokens": 47528012.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1727,
      "step_time": 13.319803450256586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 190.375,
      "completions/mean_terminated_length": 190.375,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.36268921941518784,
      "epoch": 0.08003705419175544,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014123907312750816,
      "kl": 0.0016592381580267102,
      "learning_rate": 9.840018527095876e-07,
      "loss": 0.0001,
      "num_tokens": 47560370.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1728,
      "step_time": 23.27802597731352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 129.5,
      "completions/mean_terminated_length": 129.5,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.25399216264486313,
      "epoch": 0.08008337193144975,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004883928690105677,
      "kl": 0.0017067000735551119,
      "learning_rate": 9.839925891616488e-07,
      "loss": 0.0001,
      "num_tokens": 47582538.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1729,
      "step_time": 14.92527623474598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 137.6875,
      "completions/mean_terminated_length": 137.6875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.3258182033896446,
      "epoch": 0.08012968967114405,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011030449531972408,
      "kl": 0.0015004372398834676,
      "learning_rate": 9.8398332561371e-07,
      "loss": 0.0001,
      "num_tokens": 47608437.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1730,
      "step_time": 16.201912455260754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 175.4375,
      "completions/mean_terminated_length": 175.4375,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.2518988251686096,
      "epoch": 0.08017600741083836,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09052787721157074,
      "kl": 0.0013592242321465164,
      "learning_rate": 9.839740620657712e-07,
      "loss": -0.0693,
      "num_tokens": 47629500.0,
      "reward": 0.809805154800415,
      "reward_std": 0.29135626554489136,
      "rewards/reward_func/mean": 0.809805154800415,
      "rewards/reward_func/std": 0.29135629534721375,
      "step": 1731,
      "step_time": 18.97813592478633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 150.375,
      "completions/mean_terminated_length": 150.375,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.40458470582962036,
      "epoch": 0.08022232515053265,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010849455138668418,
      "kl": 0.001426139788236469,
      "learning_rate": 9.839647985178323e-07,
      "loss": 0.0001,
      "num_tokens": 47652658.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1732,
      "step_time": 18.86000870913267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 169.3125,
      "completions/mean_terminated_length": 169.3125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.2905171662569046,
      "epoch": 0.08026864289022696,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11406318098306656,
      "kl": 0.0018576665024738759,
      "learning_rate": 9.839555349698935e-07,
      "loss": -0.0569,
      "num_tokens": 47673703.0,
      "reward": 0.7525621652603149,
      "reward_std": 0.3050681948661804,
      "rewards/reward_func/mean": 0.7525621652603149,
      "rewards/reward_func/std": 0.3050681948661804,
      "step": 1733,
      "step_time": 16.87060249224305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 129.4375,
      "completions/mean_terminated_length": 129.4375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2552470825612545,
      "epoch": 0.08031496062992126,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012431704672053456,
      "kl": 0.0010684909793781117,
      "learning_rate": 9.839462714219546e-07,
      "loss": 0.0001,
      "num_tokens": 47695742.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1734,
      "step_time": 13.730560723692179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 298.0,
      "completions/max_terminated_length": 298.0,
      "completions/mean_length": 252.0,
      "completions/mean_terminated_length": 252.0,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "entropy": 0.24373339861631393,
      "epoch": 0.08036127836961557,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10387435555458069,
      "kl": 0.0017696096329018474,
      "learning_rate": 9.839370078740157e-07,
      "loss": -0.041,
      "num_tokens": 47731102.0,
      "reward": 0.6591430902481079,
      "reward_std": 0.09329462051391602,
      "rewards/reward_func/mean": 0.6591430902481079,
      "rewards/reward_func/std": 0.09329462051391602,
      "step": 1735,
      "step_time": 27.556267201900482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 434.0,
      "completions/max_terminated_length": 434.0,
      "completions/mean_length": 253.5625,
      "completions/mean_terminated_length": 253.5625,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.43792251497507095,
      "epoch": 0.08040759610930986,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08416919410228729,
      "kl": 0.001997353887418285,
      "learning_rate": 9.839277443260768e-07,
      "loss": -0.2162,
      "num_tokens": 47771975.0,
      "reward": 0.006604422815144062,
      "reward_std": 0.012659032829105854,
      "rewards/reward_func/mean": 0.006604422815144062,
      "rewards/reward_func/std": 0.012659032829105854,
      "step": 1736,
      "step_time": 38.68926587700844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 136.375,
      "completions/mean_terminated_length": 136.375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2899010255932808,
      "epoch": 0.08045391384900417,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010311250807717443,
      "kl": 0.0012446382024791092,
      "learning_rate": 9.83918480778138e-07,
      "loss": 0.0001,
      "num_tokens": 47805373.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1737,
      "step_time": 17.425171364098787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 172.75,
      "completions/mean_terminated_length": 172.75,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3713117316365242,
      "epoch": 0.08050023158869847,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00268686399795115,
      "kl": 0.0017970122571568936,
      "learning_rate": 9.83909217230199e-07,
      "loss": 0.0001,
      "num_tokens": 47826905.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1738,
      "step_time": 18.37579194828868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 139.5,
      "completions/mean_terminated_length": 139.5,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.2326408438384533,
      "epoch": 0.08054654932839278,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019349164795130491,
      "kl": 0.0018987695802934468,
      "learning_rate": 9.838999536822602e-07,
      "loss": 0.0001,
      "num_tokens": 47847873.0,
      "reward": 0.0018635177984833717,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0018635177984833717,
      "rewards/reward_func/std": 0.0,
      "step": 1739,
      "step_time": 16.576042093336582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 186.5625,
      "completions/mean_terminated_length": 186.5625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.32996729016304016,
      "epoch": 0.08059286706808708,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08055779337882996,
      "kl": 0.0017422466480638832,
      "learning_rate": 9.838906901343213e-07,
      "loss": -0.001,
      "num_tokens": 47869818.0,
      "reward": 0.023791268467903137,
      "reward_std": 0.0015891696093603969,
      "rewards/reward_func/mean": 0.023791268467903137,
      "rewards/reward_func/std": 0.0015891696093603969,
      "step": 1740,
      "step_time": 20.29440288618207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 120.4375,
      "completions/mean_terminated_length": 120.4375,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.29793283343315125,
      "epoch": 0.08063918480778139,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008918229723349214,
      "kl": 0.001052564402925782,
      "learning_rate": 9.838814265863825e-07,
      "loss": 0.0001,
      "num_tokens": 47890961.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1741,
      "step_time": 13.468116946518421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 138.375,
      "completions/mean_terminated_length": 138.375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.17352503538131714,
      "epoch": 0.08068550254747568,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001970309065654874,
      "kl": 0.0015197321772575378,
      "learning_rate": 9.838721630384436e-07,
      "loss": 0.0001,
      "num_tokens": 47916775.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1742,
      "step_time": 15.390880346298218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 156.3125,
      "completions/mean_terminated_length": 156.3125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3092532157897949,
      "epoch": 0.08073182028716999,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022548986598849297,
      "kl": 0.0017885872512124479,
      "learning_rate": 9.83862899490505e-07,
      "loss": 0.0001,
      "num_tokens": 47939660.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1743,
      "step_time": 17.08356538042426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 161.875,
      "completions/mean_terminated_length": 161.875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2483629770576954,
      "epoch": 0.08077813802686429,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010942198568955064,
      "kl": 0.0010861923801712692,
      "learning_rate": 9.83853635942566e-07,
      "loss": 0.0001,
      "num_tokens": 47963962.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 1744,
      "step_time": 18.27033767849207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 147.875,
      "completions/mean_terminated_length": 147.875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.2996247857809067,
      "epoch": 0.0808244557665586,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002985493279993534,
      "kl": 0.0020667205681093037,
      "learning_rate": 9.838443723946272e-07,
      "loss": 0.0001,
      "num_tokens": 47984328.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1745,
      "step_time": 16.176025312393904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 133.75,
      "completions/mean_terminated_length": 133.75,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.370588019490242,
      "epoch": 0.0808707735062529,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020940375979989767,
      "kl": 0.0019824077317025512,
      "learning_rate": 9.83835108846688e-07,
      "loss": 0.0001,
      "num_tokens": 48008948.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1746,
      "step_time": 16.244132790714502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 125.375,
      "completions/mean_terminated_length": 125.375,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2751948833465576,
      "epoch": 0.0809170912459472,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013756590196862817,
      "kl": 0.0014317425666376948,
      "learning_rate": 9.838258452987494e-07,
      "loss": 0.0001,
      "num_tokens": 48029546.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1747,
      "step_time": 14.355286739766598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 340.0,
      "completions/max_terminated_length": 340.0,
      "completions/mean_length": 221.5,
      "completions/mean_terminated_length": 221.5,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.2890290319919586,
      "epoch": 0.0809634089856415,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09583621472120285,
      "kl": 0.002892935706768185,
      "learning_rate": 9.838165817508106e-07,
      "loss": -0.013,
      "num_tokens": 48051314.0,
      "reward": 0.8611046075820923,
      "reward_std": 0.33824291825294495,
      "rewards/reward_func/mean": 0.8611046075820923,
      "rewards/reward_func/std": 0.33824291825294495,
      "step": 1748,
      "step_time": 27.2410273514688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 174.25,
      "completions/mean_terminated_length": 174.25,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.3782690018415451,
      "epoch": 0.08100972672533581,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019277514657005668,
      "kl": 0.0020270912791602314,
      "learning_rate": 9.838073182028717e-07,
      "loss": 0.0001,
      "num_tokens": 48072854.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1749,
      "step_time": 19.19731855392456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 174.5625,
      "completions/mean_terminated_length": 174.5625,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.34312743693590164,
      "epoch": 0.0810560444650301,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017144788289442658,
      "kl": 0.001481565908761695,
      "learning_rate": 9.837980546549328e-07,
      "loss": 0.0001,
      "num_tokens": 48104719.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1750,
      "step_time": 22.33763938769698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 178.375,
      "completions/mean_terminated_length": 178.375,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.1942046657204628,
      "epoch": 0.08110236220472442,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08009286224842072,
      "kl": 0.0013910191337345168,
      "learning_rate": 9.83788791106994e-07,
      "loss": -0.0371,
      "num_tokens": 48133925.0,
      "reward": 0.5303791761398315,
      "reward_std": 0.20773735642433167,
      "rewards/reward_func/mean": 0.5303791761398315,
      "rewards/reward_func/std": 0.20773734152317047,
      "step": 1751,
      "step_time": 19.963861864060163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 123.0,
      "completions/max_terminated_length": 123.0,
      "completions/mean_length": 112.0625,
      "completions/mean_terminated_length": 112.0625,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2743670344352722,
      "epoch": 0.08114867994441871,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011545876041054726,
      "kl": 0.0012253207387402654,
      "learning_rate": 9.83779527559055e-07,
      "loss": 0.0001,
      "num_tokens": 48154982.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1752,
      "step_time": 12.302139397710562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 142.75,
      "completions/mean_terminated_length": 142.75,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.30697520077228546,
      "epoch": 0.08119499768411302,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002024251502007246,
      "kl": 0.0017657809075899422,
      "learning_rate": 9.837702640111162e-07,
      "loss": 0.0001,
      "num_tokens": 48189938.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1753,
      "step_time": 18.834266159683466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 157.5625,
      "completions/mean_terminated_length": 157.5625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.21398412063717842,
      "epoch": 0.08124131542380732,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08869022876024246,
      "kl": 0.0008632429380668327,
      "learning_rate": 9.837610004631773e-07,
      "loss": -0.0103,
      "num_tokens": 48214203.0,
      "reward": 0.874308168888092,
      "reward_std": 0.03275489807128906,
      "rewards/reward_func/mean": 0.874308168888092,
      "rewards/reward_func/std": 0.03275489807128906,
      "step": 1754,
      "step_time": 17.75719940289855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 124.875,
      "completions/mean_terminated_length": 124.875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3036256954073906,
      "epoch": 0.08128763316350163,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011545493034645915,
      "kl": 0.0013409795064944774,
      "learning_rate": 9.837517369152384e-07,
      "loss": 0.0001,
      "num_tokens": 48240601.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1755,
      "step_time": 15.568399280309677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 187.125,
      "completions/mean_terminated_length": 187.125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3096730038523674,
      "epoch": 0.08133395090319592,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10500717908143997,
      "kl": 0.001821783254854381,
      "learning_rate": 9.837424733672995e-07,
      "loss": 0.0119,
      "num_tokens": 48263531.0,
      "reward": 0.8728713393211365,
      "reward_std": 0.23276567459106445,
      "rewards/reward_func/mean": 0.8728713393211365,
      "rewards/reward_func/std": 0.23276568949222565,
      "step": 1756,
      "step_time": 20.194113560020924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 195.9375,
      "completions/mean_terminated_length": 195.9375,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.1552904136478901,
      "epoch": 0.08138026864289023,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013945831451565027,
      "kl": 0.0010427927481941879,
      "learning_rate": 9.837332098193609e-07,
      "loss": 0.0001,
      "num_tokens": 48288202.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1757,
      "step_time": 20.05296416208148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 124.75,
      "completions/mean_terminated_length": 124.75,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.32303228229284286,
      "epoch": 0.08142658638258453,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024470367934554815,
      "kl": 0.0021272314770612866,
      "learning_rate": 9.837239462714218e-07,
      "loss": 0.0001,
      "num_tokens": 48309494.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1758,
      "step_time": 13.834779296070337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 147.6875,
      "completions/mean_terminated_length": 147.6875,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3536003828048706,
      "epoch": 0.08147290412227884,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012544355122372508,
      "kl": 0.0014445357373915613,
      "learning_rate": 9.83714682723483e-07,
      "loss": 0.0001,
      "num_tokens": 48333233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1759,
      "step_time": 16.417766705155373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 169.625,
      "completions/mean_terminated_length": 169.625,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.33668191730976105,
      "epoch": 0.08151922186197313,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013698124093934894,
      "kl": 0.0015107369981706142,
      "learning_rate": 9.837054191755443e-07,
      "loss": 0.0001,
      "num_tokens": 48358667.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1760,
      "step_time": 17.65128844976425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 131.5625,
      "completions/mean_terminated_length": 131.5625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3014110028743744,
      "epoch": 0.08156553960166744,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027149177622050047,
      "kl": 0.0019454198190942407,
      "learning_rate": 9.836961556276054e-07,
      "loss": 0.0001,
      "num_tokens": 48380948.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1761,
      "step_time": 14.282371658831835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 275.0,
      "completions/max_terminated_length": 275.0,
      "completions/mean_length": 209.5625,
      "completions/mean_terminated_length": 209.5625,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.4765824005007744,
      "epoch": 0.08161185734136174,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034426741767674685,
      "kl": 0.003096617292612791,
      "learning_rate": 9.836868920796665e-07,
      "loss": 0.0002,
      "num_tokens": 48407821.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1762,
      "step_time": 23.894170958548784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 130.9375,
      "completions/mean_terminated_length": 130.9375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.29027804732322693,
      "epoch": 0.08165817508105605,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001258092699572444,
      "kl": 0.0011905626743100584,
      "learning_rate": 9.836776285317276e-07,
      "loss": 0.0001,
      "num_tokens": 48432204.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1763,
      "step_time": 15.331847336143255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 159.875,
      "completions/mean_terminated_length": 159.875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.31587763130664825,
      "epoch": 0.08170449282075035,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023555525112897158,
      "kl": 0.001577354152686894,
      "learning_rate": 9.836683649837888e-07,
      "loss": 0.0001,
      "num_tokens": 48454378.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1764,
      "step_time": 16.64985018968582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 156.5625,
      "completions/mean_terminated_length": 156.5625,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.4026193544268608,
      "epoch": 0.08175081056044466,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000793006329331547,
      "kl": 0.001331146020675078,
      "learning_rate": 9.836591014358499e-07,
      "loss": 0.0001,
      "num_tokens": 48484691.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1765,
      "step_time": 18.201582103967667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 178.8125,
      "completions/mean_terminated_length": 178.8125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.3697373867034912,
      "epoch": 0.08179712830013895,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003388105658814311,
      "kl": 0.0022740360582247376,
      "learning_rate": 9.83649837887911e-07,
      "loss": 0.0001,
      "num_tokens": 48518400.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1766,
      "step_time": 20.224620919674635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 156.125,
      "completions/mean_terminated_length": 156.125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.19097772240638733,
      "epoch": 0.08184344603983326,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0848284438252449,
      "kl": 0.0007946285040816292,
      "learning_rate": 9.836405743399721e-07,
      "loss": -0.0531,
      "num_tokens": 48553698.0,
      "reward": 0.9223366379737854,
      "reward_std": 0.11897119879722595,
      "rewards/reward_func/mean": 0.9223366379737854,
      "rewards/reward_func/std": 0.11897119134664536,
      "step": 1767,
      "step_time": 18.881062146276236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 142.1875,
      "completions/mean_terminated_length": 142.1875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.305821530520916,
      "epoch": 0.08188976377952756,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003025626763701439,
      "kl": 0.003023940953426063,
      "learning_rate": 9.836313107920333e-07,
      "loss": 0.0002,
      "num_tokens": 48592005.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1768,
      "step_time": 19.679544236510992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 221.4375,
      "completions/mean_terminated_length": 221.4375,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "entropy": 0.24040383473038673,
      "epoch": 0.08193608151922187,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1213071346282959,
      "kl": 0.0017236094281543046,
      "learning_rate": 9.836220472440944e-07,
      "loss": 0.0285,
      "num_tokens": 48616652.0,
      "reward": 0.9858227372169495,
      "reward_std": 0.0304801557213068,
      "rewards/reward_func/mean": 0.9858227372169495,
      "rewards/reward_func/std": 0.030480151996016502,
      "step": 1769,
      "step_time": 21.561090268194675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 134.0,
      "completions/mean_terminated_length": 134.0,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.27792296558618546,
      "epoch": 0.08198239925891616,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009946874342858791,
      "kl": 0.00113045732723549,
      "learning_rate": 9.836127836961557e-07,
      "loss": 0.0001,
      "num_tokens": 48638908.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1770,
      "step_time": 16.195545982569456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 278.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 203.25,
      "completions/mean_terminated_length": 203.25,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.39546895772218704,
      "epoch": 0.08202871699861047,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09161561727523804,
      "kl": 0.0020151612407062203,
      "learning_rate": 9.836035201482166e-07,
      "loss": -0.1318,
      "num_tokens": 48661648.0,
      "reward": 0.12096919119358063,
      "reward_std": 0.3307603895664215,
      "rewards/reward_func/mean": 0.12096919119358063,
      "rewards/reward_func/std": 0.3307604193687439,
      "step": 1771,
      "step_time": 23.496229242533445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 154.6875,
      "completions/mean_terminated_length": 154.6875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.4003916531801224,
      "epoch": 0.08207503473830477,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014100077096372843,
      "kl": 0.0020715085265692323,
      "learning_rate": 9.835942566002778e-07,
      "loss": 0.0001,
      "num_tokens": 48693675.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1772,
      "step_time": 21.215330470353365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 150.75,
      "completions/mean_terminated_length": 150.75,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.29009925574064255,
      "epoch": 0.08212135247799908,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013742835726588964,
      "kl": 0.0011427075078245252,
      "learning_rate": 9.83584993052339e-07,
      "loss": 0.0001,
      "num_tokens": 48715879.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1773,
      "step_time": 15.747868739068508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 197.375,
      "completions/mean_terminated_length": 197.375,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.4076923429965973,
      "epoch": 0.08216767021769338,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033543696627020836,
      "kl": 0.002331511495867744,
      "learning_rate": 9.835757295044002e-07,
      "loss": 0.0001,
      "num_tokens": 48739789.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1774,
      "step_time": 22.986718233674765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 307.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 199.125,
      "completions/mean_terminated_length": 199.125,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3451576381921768,
      "epoch": 0.08221398795738769,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031812279485166073,
      "kl": 0.002530711644794792,
      "learning_rate": 9.835664659564613e-07,
      "loss": 0.0001,
      "num_tokens": 48767231.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1775,
      "step_time": 26.703542787581682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 162.75,
      "completions/mean_terminated_length": 162.75,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.22918789088726044,
      "epoch": 0.08226030569708198,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011042740661650896,
      "kl": 0.0012245680554769933,
      "learning_rate": 9.835572024085225e-07,
      "loss": 0.0001,
      "num_tokens": 48792283.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 1776,
      "step_time": 17.573363177478313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 293.0,
      "completions/max_terminated_length": 293.0,
      "completions/mean_length": 246.9375,
      "completions/mean_terminated_length": 246.9375,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "entropy": 0.2423914223909378,
      "epoch": 0.08230662343677629,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24949026107788086,
      "kl": 0.006801836425438523,
      "learning_rate": 9.835479388605836e-07,
      "loss": 0.0428,
      "num_tokens": 48827066.0,
      "reward": 0.8991250991821289,
      "reward_std": 0.23976711928844452,
      "rewards/reward_func/mean": 0.8991250991821289,
      "rewards/reward_func/std": 0.2397671341896057,
      "step": 1777,
      "step_time": 27.01763205602765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 287.0,
      "completions/max_terminated_length": 287.0,
      "completions/mean_length": 239.75,
      "completions/mean_terminated_length": 239.75,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "entropy": 0.2516292668879032,
      "epoch": 0.08235294117647059,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002423672005534172,
      "kl": 0.0017647942004259676,
      "learning_rate": 9.835386753126447e-07,
      "loss": 0.0001,
      "num_tokens": 48865782.0,
      "reward": 0.7441932559013367,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7441932559013367,
      "rewards/reward_func/std": 0.0,
      "step": 1778,
      "step_time": 27.450808752328157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 301.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 218.375,
      "completions/mean_terminated_length": 218.375,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.36280398070812225,
      "epoch": 0.0823992589161649,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07675819844007492,
      "kl": 0.002921669220086187,
      "learning_rate": 9.835294117647058e-07,
      "loss": -0.0686,
      "num_tokens": 48888396.0,
      "reward": 0.446357399225235,
      "reward_std": 0.2671467959880829,
      "rewards/reward_func/mean": 0.446357399225235,
      "rewards/reward_func/std": 0.2671468257904053,
      "step": 1779,
      "step_time": 24.887188009917736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 294.0,
      "completions/max_terminated_length": 294.0,
      "completions/mean_length": 197.5,
      "completions/mean_terminated_length": 197.5,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.38635627925395966,
      "epoch": 0.08244557665585919,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10332873463630676,
      "kl": 0.0033139188308268785,
      "learning_rate": 9.83520148216767e-07,
      "loss": -0.0341,
      "num_tokens": 48937924.0,
      "reward": 0.05635083466768265,
      "reward_std": 0.21836689114570618,
      "rewards/reward_func/mean": 0.05635083466768265,
      "rewards/reward_func/std": 0.21836690604686737,
      "step": 1780,
      "step_time": 30.988724403083324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 373.0,
      "completions/max_terminated_length": 373.0,
      "completions/mean_length": 275.0,
      "completions/mean_terminated_length": 275.0,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "entropy": 0.27067185193300247,
      "epoch": 0.0824918943955535,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07795015722513199,
      "kl": 0.0017696596623864025,
      "learning_rate": 9.83510884668828e-07,
      "loss": -0.0698,
      "num_tokens": 48967924.0,
      "reward": 0.1412212997674942,
      "reward_std": 0.13408060371875763,
      "rewards/reward_func/mean": 0.1412212997674942,
      "rewards/reward_func/std": 0.13408060371875763,
      "step": 1781,
      "step_time": 31.076837234199047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 190.0625,
      "completions/mean_terminated_length": 190.0625,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.42776962369680405,
      "epoch": 0.0825382121352478,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020765860099345446,
      "kl": 0.001906985737150535,
      "learning_rate": 9.835016211208892e-07,
      "loss": 0.0001,
      "num_tokens": 49008053.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1782,
      "step_time": 22.713887855410576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 173.0,
      "completions/mean_terminated_length": 173.0,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.2731229066848755,
      "epoch": 0.08258452987494211,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010325806215405464,
      "kl": 0.0012288026919122785,
      "learning_rate": 9.834923575729503e-07,
      "loss": 0.0001,
      "num_tokens": 49041429.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 1783,
      "step_time": 20.36423819512129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 192.9375,
      "completions/mean_terminated_length": 192.9375,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.30894937366247177,
      "epoch": 0.0826308476146364,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11499016731977463,
      "kl": 0.0018300899828318506,
      "learning_rate": 9.834830940250115e-07,
      "loss": 0.132,
      "num_tokens": 49063396.0,
      "reward": 0.14704498648643494,
      "reward_std": 0.08768068253993988,
      "rewards/reward_func/mean": 0.14704498648643494,
      "rewards/reward_func/std": 0.08768068999052048,
      "step": 1784,
      "step_time": 20.808938808739185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 187.125,
      "completions/mean_terminated_length": 187.125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.18482393398880959,
      "epoch": 0.08267716535433071,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17283470928668976,
      "kl": 0.0014046141877770424,
      "learning_rate": 9.834738304770726e-07,
      "loss": 0.0076,
      "num_tokens": 49085766.0,
      "reward": 0.9855483174324036,
      "reward_std": 0.0394895039498806,
      "rewards/reward_func/mean": 0.9855483174324036,
      "rewards/reward_func/std": 0.039489492774009705,
      "step": 1785,
      "step_time": 19.037325251847506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 120.125,
      "completions/mean_terminated_length": 120.125,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.23211238905787468,
      "epoch": 0.08272348309402501,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001915707252919674,
      "kl": 0.0012915268598590046,
      "learning_rate": 9.834645669291337e-07,
      "loss": 0.0001,
      "num_tokens": 49105096.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1786,
      "step_time": 13.047077864408493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 188.1875,
      "completions/mean_terminated_length": 188.1875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.17376741394400597,
      "epoch": 0.08276980083371932,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019552954472601414,
      "kl": 0.001579032134031877,
      "learning_rate": 9.83455303381195e-07,
      "loss": 0.0001,
      "num_tokens": 49129147.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 1787,
      "step_time": 19.296804752200842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 156.625,
      "completions/mean_terminated_length": 156.625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.2800978422164917,
      "epoch": 0.08281611857341362,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005788644775748253,
      "kl": 0.0014217516873031855,
      "learning_rate": 9.834460398332562e-07,
      "loss": 0.0001,
      "num_tokens": 49163909.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1788,
      "step_time": 19.545057754963636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 121.3125,
      "completions/mean_terminated_length": 121.3125,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.28535931557416916,
      "epoch": 0.08286243631310793,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034982829820364714,
      "kl": 0.002478871843777597,
      "learning_rate": 9.83436776285317e-07,
      "loss": 0.0001,
      "num_tokens": 49186074.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1789,
      "step_time": 14.213303998112679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 133.75,
      "completions/mean_terminated_length": 133.75,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3252570927143097,
      "epoch": 0.08290875405280222,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023588044568896294,
      "kl": 0.0019274250080343336,
      "learning_rate": 9.834275127373784e-07,
      "loss": 0.0001,
      "num_tokens": 49210326.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1790,
      "step_time": 15.675796948373318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 119.25,
      "completions/mean_terminated_length": 119.25,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2171715386211872,
      "epoch": 0.08295507179249653,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025463069323450327,
      "kl": 0.0015180335321929306,
      "learning_rate": 9.834182491894396e-07,
      "loss": 0.0001,
      "num_tokens": 49229770.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1791,
      "step_time": 14.180842258036137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 184.0,
      "completions/mean_terminated_length": 184.0,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.398625910282135,
      "epoch": 0.08300138953219083,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008965880842879415,
      "kl": 0.0012904828181490302,
      "learning_rate": 9.834089856415007e-07,
      "loss": 0.0001,
      "num_tokens": 49257914.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1792,
      "step_time": 19.320599518716335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 139.5625,
      "completions/mean_terminated_length": 139.5625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3106895983219147,
      "epoch": 0.08304770727188514,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013497864129021764,
      "kl": 0.0011589547211769968,
      "learning_rate": 9.833997220935618e-07,
      "loss": 0.0001,
      "num_tokens": 49288179.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1793,
      "step_time": 16.620586711913347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 129.625,
      "completions/mean_terminated_length": 129.625,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.3716931715607643,
      "epoch": 0.08309402501157943,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017326937522739172,
      "kl": 0.0013969821447972208,
      "learning_rate": 9.83390458545623e-07,
      "loss": 0.0001,
      "num_tokens": 49309981.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1794,
      "step_time": 15.223750609904528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 123.0,
      "completions/max_terminated_length": 123.0,
      "completions/mean_length": 109.5625,
      "completions/mean_terminated_length": 109.5625,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.23468219861388206,
      "epoch": 0.08314034275127374,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020196361001580954,
      "kl": 0.0012948633375344798,
      "learning_rate": 9.83381194997684e-07,
      "loss": 0.0001,
      "num_tokens": 49330166.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1795,
      "step_time": 12.192474193871021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 164.1875,
      "completions/mean_terminated_length": 164.1875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3503687083721161,
      "epoch": 0.08318666049096804,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12645870447158813,
      "kl": 0.002182353870011866,
      "learning_rate": 9.833719314497452e-07,
      "loss": 0.0114,
      "num_tokens": 49351945.0,
      "reward": 0.5515605807304382,
      "reward_std": 0.44124847650527954,
      "rewards/reward_func/mean": 0.5515605807304382,
      "rewards/reward_func/std": 0.44124844670295715,
      "step": 1796,
      "step_time": 16.90958084538579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 174.8125,
      "completions/mean_terminated_length": 174.8125,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3956194296479225,
      "epoch": 0.08323297823066235,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001963954418897629,
      "kl": 0.0018403020512778312,
      "learning_rate": 9.833626679018063e-07,
      "loss": 0.0001,
      "num_tokens": 49383446.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1797,
      "step_time": 20.44994032010436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 315.0,
      "completions/max_terminated_length": 315.0,
      "completions/mean_length": 214.25,
      "completions/mean_terminated_length": 214.25,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.39586982131004333,
      "epoch": 0.08327929597035665,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10813822597265244,
      "kl": 0.0018154153076466173,
      "learning_rate": 9.833534043538674e-07,
      "loss": 0.0989,
      "num_tokens": 49418698.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 1798,
      "step_time": 28.728622019290924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 196.875,
      "completions/mean_terminated_length": 196.875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.42988988757133484,
      "epoch": 0.08332561371005096,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023001835215836763,
      "kl": 0.0018978688749484718,
      "learning_rate": 9.833441408059286e-07,
      "loss": 0.0001,
      "num_tokens": 49444376.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1799,
      "step_time": 20.286078292876482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 121.3125,
      "completions/mean_terminated_length": 121.3125,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.24906935542821884,
      "epoch": 0.08337193144974525,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005079444032162428,
      "kl": 0.0016205398133024573,
      "learning_rate": 9.833348772579899e-07,
      "loss": 0.0001,
      "num_tokens": 49464445.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1800,
      "step_time": 13.23555477336049
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 149.875,
      "completions/mean_terminated_length": 149.875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.23199346661567688,
      "epoch": 0.08341824918943956,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012887542834505439,
      "kl": 0.0011629058717517182,
      "learning_rate": 9.833256137100508e-07,
      "loss": 0.0001,
      "num_tokens": 49491275.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 1801,
      "step_time": 17.76983129605651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 154.875,
      "completions/mean_terminated_length": 154.875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.20500538870692253,
      "epoch": 0.08346456692913386,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009976846631616354,
      "kl": 0.000767384612117894,
      "learning_rate": 9.83316350162112e-07,
      "loss": 0.0,
      "num_tokens": 49513049.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 1802,
      "step_time": 16.451257165521383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 218.875,
      "completions/mean_terminated_length": 218.875,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.39179253578186035,
      "epoch": 0.08351088466882817,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10531281679868698,
      "kl": 0.0025745042366907,
      "learning_rate": 9.833070866141733e-07,
      "loss": 0.1,
      "num_tokens": 49543719.0,
      "reward": 0.6902567148208618,
      "reward_std": 0.4121498465538025,
      "rewards/reward_func/mean": 0.6902567148208618,
      "rewards/reward_func/std": 0.4121498465538025,
      "step": 1803,
      "step_time": 26.545985084027052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 165.5625,
      "completions/mean_terminated_length": 165.5625,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.43202464282512665,
      "epoch": 0.08355720240852246,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007357713766396046,
      "kl": 0.0014624310715589672,
      "learning_rate": 9.832978230662344e-07,
      "loss": 0.0001,
      "num_tokens": 49580416.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1804,
      "step_time": 20.909253243356943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 140.3125,
      "completions/mean_terminated_length": 140.3125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.27327684313058853,
      "epoch": 0.08360352014821677,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028917582239955664,
      "kl": 0.001670055149588734,
      "learning_rate": 9.832885595182955e-07,
      "loss": 0.0001,
      "num_tokens": 49600805.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1805,
      "step_time": 14.909344926476479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 191.875,
      "completions/mean_terminated_length": 191.875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.24407634884119034,
      "epoch": 0.08364983788791107,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08293452858924866,
      "kl": 0.0010697099642129615,
      "learning_rate": 9.832792959703566e-07,
      "loss": 0.0338,
      "num_tokens": 49622739.0,
      "reward": 0.9848532676696777,
      "reward_std": 0.02709529735147953,
      "rewards/reward_func/mean": 0.9848532676696777,
      "rewards/reward_func/std": 0.02709529921412468,
      "step": 1806,
      "step_time": 19.296129278838634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 196.125,
      "completions/mean_terminated_length": 196.125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.28344016522169113,
      "epoch": 0.08369615562760538,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003905265359207988,
      "kl": 0.002277866209624335,
      "learning_rate": 9.832700324224178e-07,
      "loss": 0.0001,
      "num_tokens": 49644709.0,
      "reward": 0.26359713077545166,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.26359713077545166,
      "rewards/reward_func/std": 0.0,
      "step": 1807,
      "step_time": 19.552649281919003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 167.125,
      "completions/mean_terminated_length": 167.125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.17779069393873215,
      "epoch": 0.08374247336729967,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017649864312261343,
      "kl": 0.001114297832828015,
      "learning_rate": 9.832607688744789e-07,
      "loss": 0.0001,
      "num_tokens": 49670839.0,
      "reward": 0.5468458533287048,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5468458533287048,
      "rewards/reward_func/std": 0.0,
      "step": 1808,
      "step_time": 18.213044803589582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 135.5625,
      "completions/mean_terminated_length": 135.5625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2523210905492306,
      "epoch": 0.08378879110699398,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15321528911590576,
      "kl": 0.002367090666666627,
      "learning_rate": 9.8325150532654e-07,
      "loss": -0.0064,
      "num_tokens": 49691536.0,
      "reward": 0.9750396013259888,
      "reward_std": 0.0536632239818573,
      "rewards/reward_func/mean": 0.9750396013259888,
      "rewards/reward_func/std": 0.0536632314324379,
      "step": 1809,
      "step_time": 16.44727297499776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 460.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 244.125,
      "completions/mean_terminated_length": 244.125,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.43200092017650604,
      "epoch": 0.08383510884668828,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07732279598712921,
      "kl": 0.0023723530175630003,
      "learning_rate": 9.832422417786011e-07,
      "loss": -0.0417,
      "num_tokens": 49713954.0,
      "reward": 0.050845880061388016,
      "reward_std": 0.2019171267747879,
      "rewards/reward_func/mean": 0.050845880061388016,
      "rewards/reward_func/std": 0.2019171416759491,
      "step": 1810,
      "step_time": 35.61111123859882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 318.625,
      "completions/mean_terminated_length": 271.6000061035156,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "entropy": 0.17735937237739563,
      "epoch": 0.08388142658638259,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07135229557752609,
      "kl": 0.00859279406722635,
      "learning_rate": 9.832329782306623e-07,
      "loss": 0.5534,
      "num_tokens": 49746076.0,
      "reward": 0.804602324962616,
      "reward_std": 0.21490788459777832,
      "rewards/reward_func/mean": 0.804602324962616,
      "rewards/reward_func/std": 0.21490789949893951,
      "step": 1811,
      "step_time": 78.7587280869484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 193.0625,
      "completions/mean_terminated_length": 193.0625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.19942530244588852,
      "epoch": 0.08392774432607689,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008088955073617399,
      "kl": 0.0008911041804822162,
      "learning_rate": 9.832237146827234e-07,
      "loss": 0.0,
      "num_tokens": 49771997.0,
      "reward": 0.7195215821266174,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7195215821266174,
      "rewards/reward_func/std": 0.0,
      "step": 1812,
      "step_time": 18.957622949033976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 180.4375,
      "completions/mean_terminated_length": 180.4375,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.21608738601207733,
      "epoch": 0.0839740620657712,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09081307053565979,
      "kl": 0.0011739842884708196,
      "learning_rate": 9.832144511347847e-07,
      "loss": 0.0007,
      "num_tokens": 49794340.0,
      "reward": 0.9288549423217773,
      "reward_std": 0.02620236761868,
      "rewards/reward_func/mean": 0.9288549423217773,
      "rewards/reward_func/std": 0.026202375069260597,
      "step": 1813,
      "step_time": 18.37841545045376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 183.1875,
      "completions/mean_terminated_length": 183.1875,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.23088476434350014,
      "epoch": 0.08402037980546549,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002391039626672864,
      "kl": 0.001544103433843702,
      "learning_rate": 9.832051875868456e-07,
      "loss": 0.0001,
      "num_tokens": 49817207.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 1814,
      "step_time": 21.5418782196939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 122.6875,
      "completions/mean_terminated_length": 122.6875,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.25327617675065994,
      "epoch": 0.0840666975451598,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012999364407733083,
      "kl": 0.0011234616104047745,
      "learning_rate": 9.831959240389068e-07,
      "loss": 0.0001,
      "num_tokens": 49836754.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1815,
      "step_time": 12.833026364445686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 210.4375,
      "completions/mean_terminated_length": 210.4375,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.31768694519996643,
      "epoch": 0.0841130152848541,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025341857690364122,
      "kl": 0.001781856408342719,
      "learning_rate": 9.831866604909679e-07,
      "loss": 0.0001,
      "num_tokens": 49867865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1816,
      "step_time": 23.116089649498463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 130.1875,
      "completions/mean_terminated_length": 130.1875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.33896253257989883,
      "epoch": 0.08415933302454841,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016798991709947586,
      "kl": 0.001647378463530913,
      "learning_rate": 9.831773969430292e-07,
      "loss": 0.0001,
      "num_tokens": 49889596.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1817,
      "step_time": 14.299078464508057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 355.0,
      "completions/max_terminated_length": 355.0,
      "completions/mean_length": 303.9375,
      "completions/mean_terminated_length": 303.9375,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "entropy": 0.24887847900390625,
      "epoch": 0.0842056507642427,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05676943436264992,
      "kl": 0.0022791285591665655,
      "learning_rate": 9.831681333950904e-07,
      "loss": -0.0944,
      "num_tokens": 49930059.0,
      "reward": 0.8556371331214905,
      "reward_std": 0.33811211585998535,
      "rewards/reward_func/mean": 0.8556371331214905,
      "rewards/reward_func/std": 0.33811214566230774,
      "step": 1818,
      "step_time": 34.08150742575526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 154.875,
      "completions/mean_terminated_length": 154.875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.19956690818071365,
      "epoch": 0.08425196850393701,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015095311682671309,
      "kl": 0.0009476547274971381,
      "learning_rate": 9.831588698471515e-07,
      "loss": 0.0,
      "num_tokens": 49955289.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 1819,
      "step_time": 16.97852310538292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 136.625,
      "completions/mean_terminated_length": 136.625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.20373685285449028,
      "epoch": 0.08429828624363131,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001349732163362205,
      "kl": 0.0008416222117375582,
      "learning_rate": 9.831496062992126e-07,
      "loss": 0.0,
      "num_tokens": 49975955.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 1820,
      "step_time": 15.104047987610102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 167.0625,
      "completions/mean_terminated_length": 167.0625,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.32213493436574936,
      "epoch": 0.08434460398332562,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021454528905451298,
      "kl": 0.0014302792551461607,
      "learning_rate": 9.831403427512737e-07,
      "loss": 0.0001,
      "num_tokens": 49999492.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1821,
      "step_time": 19.009277906268835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 140.3125,
      "completions/mean_terminated_length": 140.3125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2914526090025902,
      "epoch": 0.08439092172301992,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00561274541541934,
      "kl": 0.002711360401008278,
      "learning_rate": 9.831310792033349e-07,
      "loss": 0.0001,
      "num_tokens": 50022953.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1822,
      "step_time": 16.8443075530231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 174.0,
      "completions/mean_terminated_length": 174.0,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.2728447914123535,
      "epoch": 0.08443723946271423,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002339413855224848,
      "kl": 0.0017294083663728088,
      "learning_rate": 9.83121815655396e-07,
      "loss": 0.0001,
      "num_tokens": 50047817.0,
      "reward": 0.6339495778083801,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6339495778083801,
      "rewards/reward_func/std": 0.0,
      "step": 1823,
      "step_time": 18.795628257095814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 134.1875,
      "completions/mean_terminated_length": 134.1875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3065073862671852,
      "epoch": 0.08448355720240852,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018273607129231095,
      "kl": 0.0014974308433011174,
      "learning_rate": 9.83112552107457e-07,
      "loss": 0.0001,
      "num_tokens": 50070364.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1824,
      "step_time": 14.868289861828089
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 169.5625,
      "completions/mean_terminated_length": 169.5625,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.43381329625844955,
      "epoch": 0.08452987494210283,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021011296194046736,
      "kl": 0.002246720192488283,
      "learning_rate": 9.831032885595182e-07,
      "loss": 0.0001,
      "num_tokens": 50109557.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1825,
      "step_time": 22.53801593557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 119.4375,
      "completions/mean_terminated_length": 119.4375,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.28671035915613174,
      "epoch": 0.08457619268179713,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020305654034018517,
      "kl": 0.0018016251851804554,
      "learning_rate": 9.830940250115794e-07,
      "loss": 0.0001,
      "num_tokens": 50130828.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1826,
      "step_time": 13.542615745216608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 132.5625,
      "completions/mean_terminated_length": 132.5625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.25577178224921227,
      "epoch": 0.08462251042149144,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032878646161407232,
      "kl": 0.001347804325632751,
      "learning_rate": 9.830847614636405e-07,
      "loss": 0.0001,
      "num_tokens": 50156117.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1827,
      "step_time": 14.89745607599616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 203.5625,
      "completions/mean_terminated_length": 203.5625,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.4408046752214432,
      "epoch": 0.08466882816118573,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015493660466745496,
      "kl": 0.0016584981349296868,
      "learning_rate": 9.830754979157016e-07,
      "loss": 0.0001,
      "num_tokens": 50182782.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1828,
      "step_time": 22.523534949868917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 182.5,
      "completions/mean_terminated_length": 182.5,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.4187343046069145,
      "epoch": 0.08471514590088004,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015243644593283534,
      "kl": 0.00160211167531088,
      "learning_rate": 9.830662343677627e-07,
      "loss": 0.0001,
      "num_tokens": 50215558.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1829,
      "step_time": 22.589669562876225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 132.875,
      "completions/mean_terminated_length": 132.875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.3287733271718025,
      "epoch": 0.08476146364057434,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002134723588824272,
      "kl": 0.0015161820920184255,
      "learning_rate": 9.83056970819824e-07,
      "loss": 0.0001,
      "num_tokens": 50248996.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1830,
      "step_time": 17.67187250033021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 138.25,
      "completions/mean_terminated_length": 138.25,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.21432293578982353,
      "epoch": 0.08480778138026865,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010669506154954433,
      "kl": 0.001128709947806783,
      "learning_rate": 9.830477072718852e-07,
      "loss": 0.0001,
      "num_tokens": 50274424.0,
      "reward": 0.815539538860321,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.815539538860321,
      "rewards/reward_func/std": 0.0,
      "step": 1831,
      "step_time": 15.700519874691963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 129.375,
      "completions/mean_terminated_length": 129.375,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.1663511097431183,
      "epoch": 0.08485409911996294,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014463907573372126,
      "kl": 0.0009139027533819899,
      "learning_rate": 9.83038443723946e-07,
      "loss": 0.0,
      "num_tokens": 50295518.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 1832,
      "step_time": 14.905838370323181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 158.0,
      "completions/mean_terminated_length": 158.0,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.356958732008934,
      "epoch": 0.08490041685965725,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013619877863675356,
      "kl": 0.0015395651280414313,
      "learning_rate": 9.830291801760074e-07,
      "loss": 0.0001,
      "num_tokens": 50315998.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1833,
      "step_time": 16.306883040815592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 160.8125,
      "completions/mean_terminated_length": 160.8125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.13665926083922386,
      "epoch": 0.08494673459935155,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10321447998285294,
      "kl": 0.0006671329028904438,
      "learning_rate": 9.830199166280686e-07,
      "loss": -0.0015,
      "num_tokens": 50348251.0,
      "reward": 0.8971847295761108,
      "reward_std": 0.040134936571121216,
      "rewards/reward_func/mean": 0.8971847295761108,
      "rewards/reward_func/std": 0.04013495147228241,
      "step": 1834,
      "step_time": 18.512251127511263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 137.0625,
      "completions/mean_terminated_length": 137.0625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.2695477083325386,
      "epoch": 0.08499305233904586,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001143561559729278,
      "kl": 0.0010611762409098446,
      "learning_rate": 9.830106530801297e-07,
      "loss": 0.0001,
      "num_tokens": 50370988.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1835,
      "step_time": 14.397206574678421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 194.0,
      "completions/mean_terminated_length": 194.0,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.1885693371295929,
      "epoch": 0.08503937007874016,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002070471178740263,
      "kl": 0.0013506181130651385,
      "learning_rate": 9.830013895321908e-07,
      "loss": 0.0001,
      "num_tokens": 50408556.0,
      "reward": 0.9091564416885376,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9091564416885376,
      "rewards/reward_func/std": 0.0,
      "step": 1836,
      "step_time": 23.369936358183622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 130.0,
      "completions/max_terminated_length": 130.0,
      "completions/mean_length": 115.6875,
      "completions/mean_terminated_length": 115.6875,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.20968084037303925,
      "epoch": 0.08508568781843447,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002231639577075839,
      "kl": 0.001264735299628228,
      "learning_rate": 9.82992125984252e-07,
      "loss": 0.0001,
      "num_tokens": 50428055.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1837,
      "step_time": 12.542578887194395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 127.75,
      "completions/mean_terminated_length": 127.75,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.27801789343357086,
      "epoch": 0.08513200555812876,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013946460094302893,
      "kl": 0.0012275190965738147,
      "learning_rate": 9.82982862436313e-07,
      "loss": 0.0001,
      "num_tokens": 50450131.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1838,
      "step_time": 14.32071528956294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 205.5625,
      "completions/mean_terminated_length": 205.5625,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.2687208354473114,
      "epoch": 0.08517832329782307,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024656786117702723,
      "kl": 0.0018901612784247845,
      "learning_rate": 9.829735988883742e-07,
      "loss": 0.0001,
      "num_tokens": 50471852.0,
      "reward": 0.9574533700942993,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9574533700942993,
      "rewards/reward_func/std": 0.0,
      "step": 1839,
      "step_time": 19.37217130884528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 301.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 261.625,
      "completions/mean_terminated_length": 261.625,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "entropy": 0.28042303770780563,
      "epoch": 0.08522464103751737,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001550883986055851,
      "kl": 0.001388590142596513,
      "learning_rate": 9.829643353404353e-07,
      "loss": 0.0001,
      "num_tokens": 50500022.0,
      "reward": 0.8511605262756348,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8511605262756348,
      "rewards/reward_func/std": 0.0,
      "step": 1840,
      "step_time": 26.087206903845072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 165.625,
      "completions/mean_terminated_length": 165.625,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.40595898032188416,
      "epoch": 0.08527095877721168,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12976838648319244,
      "kl": 0.0017887320136651397,
      "learning_rate": 9.829550717924964e-07,
      "loss": -0.1958,
      "num_tokens": 50535424.0,
      "reward": 0.014202741906046867,
      "reward_std": 0.03880927711725235,
      "rewards/reward_func/mean": 0.014202741906046867,
      "rewards/reward_func/std": 0.03880928084254265,
      "step": 1841,
      "step_time": 25.53868832066655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 448.0,
      "completions/max_terminated_length": 448.0,
      "completions/mean_length": 346.0,
      "completions/mean_terminated_length": 346.0,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "entropy": 0.172083992511034,
      "epoch": 0.08531727651690597,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006974312709644437,
      "kl": 0.0009179186017718166,
      "learning_rate": 9.829458082445576e-07,
      "loss": 0.0,
      "num_tokens": 50564816.0,
      "reward": 0.9813933372497559,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9813933372497559,
      "rewards/reward_func/std": 0.0,
      "step": 1842,
      "step_time": 36.786931216716766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 156.125,
      "completions/mean_terminated_length": 156.125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.19916684553027153,
      "epoch": 0.08536359425660028,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010334254475310445,
      "kl": 0.0009760814864421263,
      "learning_rate": 9.82936544696619e-07,
      "loss": 0.0,
      "num_tokens": 50587026.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1843,
      "step_time": 16.807015921920538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 174.875,
      "completions/mean_terminated_length": 174.875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.4466772973537445,
      "epoch": 0.08540991199629458,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001840939512476325,
      "kl": 0.0020238936704117805,
      "learning_rate": 9.8292728114868e-07,
      "loss": 0.0001,
      "num_tokens": 50609648.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1844,
      "step_time": 18.286206517368555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 125.9375,
      "completions/mean_terminated_length": 125.9375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.276754766702652,
      "epoch": 0.08545622973598889,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015949427615851164,
      "kl": 0.00127955008065328,
      "learning_rate": 9.82918017600741e-07,
      "loss": 0.0001,
      "num_tokens": 50630399.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1845,
      "step_time": 14.9763044975698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 137.1875,
      "completions/mean_terminated_length": 137.1875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3048417717218399,
      "epoch": 0.08550254747568319,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011707213707268238,
      "kl": 0.0013620333629660308,
      "learning_rate": 9.82908754052802e-07,
      "loss": 0.0001,
      "num_tokens": 50652818.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1846,
      "step_time": 15.950649853795767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 156.875,
      "completions/mean_terminated_length": 156.875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3699372261762619,
      "epoch": 0.0855488652153775,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014884648844599724,
      "kl": 0.001510350644821301,
      "learning_rate": 9.828994905048634e-07,
      "loss": 0.0001,
      "num_tokens": 50696720.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1847,
      "step_time": 22.3307176977396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 128.0625,
      "completions/mean_terminated_length": 128.0625,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2906503230333328,
      "epoch": 0.08559518295507179,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007628771010786295,
      "kl": 0.0023074908240232617,
      "learning_rate": 9.828902269569245e-07,
      "loss": 0.0001,
      "num_tokens": 50716273.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1848,
      "step_time": 13.887910101562738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 115.0625,
      "completions/mean_terminated_length": 115.0625,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.27160170674324036,
      "epoch": 0.0856415006947661,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013438882306218147,
      "kl": 0.001252780290087685,
      "learning_rate": 9.828809634089856e-07,
      "loss": 0.0001,
      "num_tokens": 50737218.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1849,
      "step_time": 13.047743078321218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 191.3125,
      "completions/mean_terminated_length": 191.3125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.4259258583188057,
      "epoch": 0.0856878184344604,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011495755752548575,
      "kl": 0.001411606790497899,
      "learning_rate": 9.828716998610468e-07,
      "loss": 0.0001,
      "num_tokens": 50767639.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1850,
      "step_time": 23.015502981841564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 142.1875,
      "completions/mean_terminated_length": 142.1875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3776135966181755,
      "epoch": 0.08573413617415471,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032343179918825626,
      "kl": 0.0027543975738808513,
      "learning_rate": 9.82862436313108e-07,
      "loss": 0.0001,
      "num_tokens": 50821786.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1851,
      "step_time": 23.467766117304564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 154.5625,
      "completions/mean_terminated_length": 154.5625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3044525012373924,
      "epoch": 0.085780453913849,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10032334923744202,
      "kl": 0.002875492617022246,
      "learning_rate": 9.82853172765169e-07,
      "loss": 0.0337,
      "num_tokens": 50845203.0,
      "reward": 0.01100965216755867,
      "reward_std": 0.002935907104983926,
      "rewards/reward_func/mean": 0.01100965216755867,
      "rewards/reward_func/std": 0.002935907104983926,
      "step": 1852,
      "step_time": 16.856601383537054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 200.75,
      "completions/mean_terminated_length": 200.75,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3908821791410446,
      "epoch": 0.08582677165354331,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06613165140151978,
      "kl": 0.0034208440338261425,
      "learning_rate": 9.828439092172301e-07,
      "loss": 0.0073,
      "num_tokens": 50872767.0,
      "reward": 0.058713316917419434,
      "reward_std": 0.23485326766967773,
      "rewards/reward_func/mean": 0.058713316917419434,
      "rewards/reward_func/std": 0.23485328257083893,
      "step": 1853,
      "step_time": 22.533325608819723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 165.5,
      "completions/mean_terminated_length": 165.5,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.32102739065885544,
      "epoch": 0.08587308939323761,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10941273719072342,
      "kl": 0.0024127698270604014,
      "learning_rate": 9.828346456692913e-07,
      "loss": 0.0219,
      "num_tokens": 50895383.0,
      "reward": 0.8727484941482544,
      "reward_std": 0.23336170613765717,
      "rewards/reward_func/mean": 0.8727484941482544,
      "rewards/reward_func/std": 0.23336170613765717,
      "step": 1854,
      "step_time": 18.159001354128122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 151.9375,
      "completions/mean_terminated_length": 151.9375,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.39959168434143066,
      "epoch": 0.08591940713293192,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016449117101728916,
      "kl": 0.0015864612068980932,
      "learning_rate": 9.828253821213524e-07,
      "loss": 0.0001,
      "num_tokens": 50926134.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1855,
      "step_time": 19.35863560438156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 133.375,
      "completions/mean_terminated_length": 133.375,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.3179458901286125,
      "epoch": 0.08596572487262621,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001405479502864182,
      "kl": 0.0012443141895346344,
      "learning_rate": 9.828161185734135e-07,
      "loss": 0.0001,
      "num_tokens": 50949100.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1856,
      "step_time": 15.826786436140537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 124.6875,
      "completions/mean_terminated_length": 124.6875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3403223231434822,
      "epoch": 0.08601204261232052,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024394665379077196,
      "kl": 0.0016991851734928787,
      "learning_rate": 9.828068550254746e-07,
      "loss": 0.0001,
      "num_tokens": 50982167.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1857,
      "step_time": 17.94947485253215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 169.75,
      "completions/mean_terminated_length": 169.75,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.397172674536705,
      "epoch": 0.08605836035201482,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017737135058268905,
      "kl": 0.0019284598529338837,
      "learning_rate": 9.827975914775358e-07,
      "loss": 0.0001,
      "num_tokens": 51023523.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1858,
      "step_time": 22.791668850928545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 185.125,
      "completions/mean_terminated_length": 185.125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.3660402297973633,
      "epoch": 0.08610467809170913,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00417954009026289,
      "kl": 0.002777126559522003,
      "learning_rate": 9.82788327929597e-07,
      "loss": 0.0001,
      "num_tokens": 51054341.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1859,
      "step_time": 22.992428425699472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 178.875,
      "completions/mean_terminated_length": 178.875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.35621071606874466,
      "epoch": 0.08615099583140343,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002203872427344322,
      "kl": 0.0016685849404893816,
      "learning_rate": 9.827790643816582e-07,
      "loss": 0.0001,
      "num_tokens": 51091475.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1860,
      "step_time": 22.436206620186567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 162.875,
      "completions/mean_terminated_length": 162.875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.17797373235225677,
      "epoch": 0.08619731357109774,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014392499579116702,
      "kl": 0.0008662991021992639,
      "learning_rate": 9.827698008337194e-07,
      "loss": 0.0,
      "num_tokens": 51113137.0,
      "reward": 0.8464817404747009,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8464817404747009,
      "rewards/reward_func/std": 0.0,
      "step": 1861,
      "step_time": 17.213813357055187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 154.1875,
      "completions/mean_terminated_length": 154.1875,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.37707026302814484,
      "epoch": 0.08624363131079203,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014551484491676092,
      "kl": 0.0017600449791643769,
      "learning_rate": 9.827605372857805e-07,
      "loss": 0.0001,
      "num_tokens": 51147428.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1862,
      "step_time": 19.349631626158953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 119.4375,
      "completions/mean_terminated_length": 119.4375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.27240651845932007,
      "epoch": 0.08628994905048634,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020565385930240154,
      "kl": 0.0012519661395344883,
      "learning_rate": 9.827512737378416e-07,
      "loss": 0.0001,
      "num_tokens": 51169275.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1863,
      "step_time": 13.438467107713223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 145.5625,
      "completions/mean_terminated_length": 145.5625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.24237633123993874,
      "epoch": 0.08633626679018064,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011182187590748072,
      "kl": 0.00101482545142062,
      "learning_rate": 9.827420101899027e-07,
      "loss": 0.0001,
      "num_tokens": 51189268.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1864,
      "step_time": 15.13896419852972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 119.125,
      "completions/mean_terminated_length": 119.125,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.2867928221821785,
      "epoch": 0.08638258452987495,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015448889462277293,
      "kl": 0.0012244154931977391,
      "learning_rate": 9.827327466419639e-07,
      "loss": 0.0001,
      "num_tokens": 51209190.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1865,
      "step_time": 12.89440918713808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 149.5625,
      "completions/mean_terminated_length": 149.5625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.4366995394229889,
      "epoch": 0.08642890226956924,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009921262972056866,
      "kl": 0.0016550397267565131,
      "learning_rate": 9.82723483094025e-07,
      "loss": 0.0001,
      "num_tokens": 51267759.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1866,
      "step_time": 25.876544449478388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 126.0,
      "completions/max_terminated_length": 126.0,
      "completions/mean_length": 114.875,
      "completions/mean_terminated_length": 114.875,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2834022119641304,
      "epoch": 0.08647522000926355,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001233416609466076,
      "kl": 0.0010649955947883427,
      "learning_rate": 9.827142195460861e-07,
      "loss": 0.0001,
      "num_tokens": 51288109.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1867,
      "step_time": 12.502612922340631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 226.25,
      "completions/mean_terminated_length": 226.25,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.3381047397851944,
      "epoch": 0.08652153774895785,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08101329952478409,
      "kl": 0.00206389746745117,
      "learning_rate": 9.827049559981472e-07,
      "loss": 0.0059,
      "num_tokens": 51326033.0,
      "reward": 0.6728014945983887,
      "reward_std": 0.3186631500720978,
      "rewards/reward_func/mean": 0.6728014945983887,
      "rewards/reward_func/std": 0.3186631500720978,
      "step": 1868,
      "step_time": 25.663867883384228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 284.0,
      "completions/max_terminated_length": 284.0,
      "completions/mean_length": 247.125,
      "completions/mean_terminated_length": 247.125,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "entropy": 0.2553657218813896,
      "epoch": 0.08656785548865216,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012497804127633572,
      "kl": 0.0013065116945654154,
      "learning_rate": 9.826956924502084e-07,
      "loss": 0.0001,
      "num_tokens": 51356083.0,
      "reward": 0.9381646513938904,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9381646513938904,
      "rewards/reward_func/std": 0.0,
      "step": 1869,
      "step_time": 25.252639766782522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 151.0,
      "completions/mean_terminated_length": 151.0,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.4294079840183258,
      "epoch": 0.08661417322834646,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015133166452869773,
      "kl": 0.0019314551609568298,
      "learning_rate": 9.826864289022695e-07,
      "loss": 0.0001,
      "num_tokens": 51400691.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1870,
      "step_time": 21.71081743016839
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 343.0,
      "completions/max_terminated_length": 343.0,
      "completions/mean_length": 244.1875,
      "completions/mean_terminated_length": 244.1875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.368765652179718,
      "epoch": 0.08666049096804077,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08822564780712128,
      "kl": 0.00198890981846489,
      "learning_rate": 9.826771653543306e-07,
      "loss": -0.1854,
      "num_tokens": 51440966.0,
      "reward": 0.12633299827575684,
      "reward_std": 0.22599133849143982,
      "rewards/reward_func/mean": 0.12633299827575684,
      "rewards/reward_func/std": 0.22599133849143982,
      "step": 1871,
      "step_time": 31.792429622262716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 191.0625,
      "completions/mean_terminated_length": 191.0625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.46957267075777054,
      "epoch": 0.08670680870773506,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003613095497712493,
      "kl": 0.00231377431191504,
      "learning_rate": 9.826679018063917e-07,
      "loss": 0.0001,
      "num_tokens": 51467159.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1872,
      "step_time": 21.5230940207839
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 205.125,
      "completions/mean_terminated_length": 205.125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.15072263777256012,
      "epoch": 0.08675312644742937,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06673221290111542,
      "kl": 0.0023850490106269717,
      "learning_rate": 9.82658638258453e-07,
      "loss": -0.0581,
      "num_tokens": 51498025.0,
      "reward": 0.9263890385627747,
      "reward_std": 0.20114344358444214,
      "rewards/reward_func/mean": 0.9263890385627747,
      "rewards/reward_func/std": 0.20114347338676453,
      "step": 1873,
      "step_time": 21.987693183124065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 155.0625,
      "completions/mean_terminated_length": 155.0625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3341098949313164,
      "epoch": 0.08679944418712367,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017773177241906524,
      "kl": 0.001814171380829066,
      "learning_rate": 9.826493747105142e-07,
      "loss": 0.0001,
      "num_tokens": 51524970.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1874,
      "step_time": 17.683572709560394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 195.6875,
      "completions/mean_terminated_length": 195.6875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.18182585015892982,
      "epoch": 0.08684576192681798,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016797479474917054,
      "kl": 0.0011587960179895163,
      "learning_rate": 9.82640111162575e-07,
      "loss": 0.0001,
      "num_tokens": 51549061.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1875,
      "step_time": 19.15440022945404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 141.9375,
      "completions/mean_terminated_length": 141.9375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.39947322756052017,
      "epoch": 0.08689207966651227,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001396923209540546,
      "kl": 0.0016080295317806304,
      "learning_rate": 9.826308476146362e-07,
      "loss": 0.0001,
      "num_tokens": 51591284.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1876,
      "step_time": 19.90982525423169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 174.5625,
      "completions/mean_terminated_length": 174.5625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3752344325184822,
      "epoch": 0.08693839740620658,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001198168029077351,
      "kl": 0.0013058011536486447,
      "learning_rate": 9.826215840666976e-07,
      "loss": 0.0001,
      "num_tokens": 51629037.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1877,
      "step_time": 22.050659473985434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 177.5,
      "completions/mean_terminated_length": 177.5,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3675825148820877,
      "epoch": 0.08698471514590088,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015190679114311934,
      "kl": 0.0016339565336238593,
      "learning_rate": 9.826123205187587e-07,
      "loss": 0.0001,
      "num_tokens": 51679925.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1878,
      "step_time": 24.92166867107153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 174.25,
      "completions/mean_terminated_length": 174.25,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.28735554218292236,
      "epoch": 0.08703103288559519,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11670393496751785,
      "kl": 0.0024552375252824277,
      "learning_rate": 9.826030569708198e-07,
      "loss": -0.0396,
      "num_tokens": 51704297.0,
      "reward": 0.6030696630477905,
      "reward_std": 0.3176274597644806,
      "rewards/reward_func/mean": 0.6030696630477905,
      "rewards/reward_func/std": 0.3176274597644806,
      "step": 1879,
      "step_time": 18.444391392171383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 117.5625,
      "completions/mean_terminated_length": 117.5625,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.3189704567193985,
      "epoch": 0.08707735062528948,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002227914985269308,
      "kl": 0.002039935643551871,
      "learning_rate": 9.82593793422881e-07,
      "loss": 0.0001,
      "num_tokens": 51726002.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1880,
      "step_time": 13.512067291885614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 130.75,
      "completions/mean_terminated_length": 130.75,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.23120297119021416,
      "epoch": 0.0871236683649838,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009425807511433959,
      "kl": 0.001209792186273262,
      "learning_rate": 9.82584529874942e-07,
      "loss": 0.0001,
      "num_tokens": 51745678.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1881,
      "step_time": 14.72817013412714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 211.0,
      "completions/mean_terminated_length": 211.0,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.1960393637418747,
      "epoch": 0.08716998610467809,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08713279664516449,
      "kl": 0.0014232159592211246,
      "learning_rate": 9.825752663270032e-07,
      "loss": 0.009,
      "num_tokens": 51771950.0,
      "reward": 0.9066835641860962,
      "reward_std": 0.11560893803834915,
      "rewards/reward_func/mean": 0.9066835641860962,
      "rewards/reward_func/std": 0.11560893803834915,
      "step": 1882,
      "step_time": 21.761763382703066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 312.0,
      "completions/max_terminated_length": 312.0,
      "completions/mean_length": 225.75,
      "completions/mean_terminated_length": 225.75,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "entropy": 0.18618855625391006,
      "epoch": 0.0872163038443724,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08309055864810944,
      "kl": 0.0009894871473079547,
      "learning_rate": 9.825660027790643e-07,
      "loss": -0.0953,
      "num_tokens": 51797306.0,
      "reward": 0.5495471954345703,
      "reward_std": 0.07476285099983215,
      "rewards/reward_func/mean": 0.5495471954345703,
      "rewards/reward_func/std": 0.07476283609867096,
      "step": 1883,
      "step_time": 27.02977531775832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 182.75,
      "completions/mean_terminated_length": 182.75,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.2454192265868187,
      "epoch": 0.0872626215840667,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016377604333683848,
      "kl": 0.001226855645654723,
      "learning_rate": 9.825567392311254e-07,
      "loss": 0.0001,
      "num_tokens": 51822870.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1884,
      "step_time": 19.167727533727884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 262.0,
      "completions/max_terminated_length": 262.0,
      "completions/mean_length": 193.875,
      "completions/mean_terminated_length": 193.875,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.2673642858862877,
      "epoch": 0.087308939323761,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08692874759435654,
      "kl": 0.0021311097079887986,
      "learning_rate": 9.825474756831866e-07,
      "loss": 0.0604,
      "num_tokens": 51846660.0,
      "reward": 0.8378630876541138,
      "reward_std": 0.16086310148239136,
      "rewards/reward_func/mean": 0.8378630876541138,
      "rewards/reward_func/std": 0.16086310148239136,
      "step": 1885,
      "step_time": 22.522730112075806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 190.3125,
      "completions/mean_terminated_length": 190.3125,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.38384315371513367,
      "epoch": 0.0873552570634553,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08401807397603989,
      "kl": 0.0019377712160348892,
      "learning_rate": 9.825382121352477e-07,
      "loss": -0.0218,
      "num_tokens": 51879145.0,
      "reward": 0.0322435162961483,
      "reward_std": 0.1289740651845932,
      "rewards/reward_func/mean": 0.0322435162961483,
      "rewards/reward_func/std": 0.1289740651845932,
      "step": 1886,
      "step_time": 21.635551754385233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 174.4375,
      "completions/mean_terminated_length": 174.4375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3883207440376282,
      "epoch": 0.08740157480314961,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027886752504855394,
      "kl": 0.001899563183542341,
      "learning_rate": 9.82528948587309e-07,
      "loss": 0.0001,
      "num_tokens": 51913808.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1887,
      "step_time": 21.102830704301596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 138.875,
      "completions/mean_terminated_length": 138.875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.288997083902359,
      "epoch": 0.08744789254284391,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002589646028354764,
      "kl": 0.0016991238808259368,
      "learning_rate": 9.8251968503937e-07,
      "loss": 0.0001,
      "num_tokens": 51936622.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1888,
      "step_time": 16.819790691137314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 138.875,
      "completions/mean_terminated_length": 138.875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2332070954144001,
      "epoch": 0.08749421028253822,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015839972766116261,
      "kl": 0.0013473039434757084,
      "learning_rate": 9.82510421491431e-07,
      "loss": 0.0001,
      "num_tokens": 51956380.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1889,
      "step_time": 15.471843719482422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 126.0,
      "completions/max_terminated_length": 126.0,
      "completions/mean_length": 114.5,
      "completions/mean_terminated_length": 114.5,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.25283853337168694,
      "epoch": 0.08754052802223251,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002629344817250967,
      "kl": 0.0012974267883691937,
      "learning_rate": 9.825011579434924e-07,
      "loss": 0.0001,
      "num_tokens": 51975812.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1890,
      "step_time": 12.317603267729282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 293.0,
      "completions/max_terminated_length": 293.0,
      "completions/mean_length": 224.25,
      "completions/mean_terminated_length": 224.25,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.30954907089471817,
      "epoch": 0.08758684576192682,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09644221514463425,
      "kl": 0.001653652056120336,
      "learning_rate": 9.824918943955535e-07,
      "loss": 0.0767,
      "num_tokens": 52005224.0,
      "reward": 0.913185715675354,
      "reward_std": 0.18401484191417694,
      "rewards/reward_func/mean": 0.913185715675354,
      "rewards/reward_func/std": 0.18401482701301575,
      "step": 1891,
      "step_time": 26.022328063845634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 221.4375,
      "completions/mean_terminated_length": 221.4375,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "entropy": 0.16434165090322495,
      "epoch": 0.08763316350162112,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004602053668349981,
      "kl": 0.00152591013466008,
      "learning_rate": 9.824826308476147e-07,
      "loss": 0.0001,
      "num_tokens": 52043983.0,
      "reward": 0.9793821573257446,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9793821573257446,
      "rewards/reward_func/std": 0.0,
      "step": 1892,
      "step_time": 24.3629249073565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 171.4375,
      "completions/mean_terminated_length": 171.4375,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.20935287326574326,
      "epoch": 0.08767948124131543,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.089290551841259,
      "kl": 0.0008470645552733913,
      "learning_rate": 9.824733672996758e-07,
      "loss": 0.0032,
      "num_tokens": 52088694.0,
      "reward": 0.9019652009010315,
      "reward_std": 0.06826267391443253,
      "rewards/reward_func/mean": 0.9019652009010315,
      "rewards/reward_func/std": 0.06826266646385193,
      "step": 1893,
      "step_time": 22.164428021758795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 212.25,
      "completions/mean_terminated_length": 212.25,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "entropy": 0.4360707327723503,
      "epoch": 0.08772579898100973,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016040436457842588,
      "kl": 0.0023941476247273386,
      "learning_rate": 9.82464103751737e-07,
      "loss": 0.0001,
      "num_tokens": 52137178.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1894,
      "step_time": 26.880092922598124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 129.625,
      "completions/mean_terminated_length": 129.625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.30159368366003036,
      "epoch": 0.08777211672070404,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015431154752150178,
      "kl": 0.0015250979049596936,
      "learning_rate": 9.82454840203798e-07,
      "loss": 0.0001,
      "num_tokens": 52157956.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1895,
      "step_time": 15.323430735617876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 297.0,
      "completions/max_terminated_length": 297.0,
      "completions/mean_length": 223.0,
      "completions/mean_terminated_length": 223.0,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.35118141025304794,
      "epoch": 0.08781843446039833,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0921139344573021,
      "kl": 0.002168814418837428,
      "learning_rate": 9.824455766558592e-07,
      "loss": -0.0817,
      "num_tokens": 52182068.0,
      "reward": 0.0625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.0625,
      "rewards/reward_func/std": 0.25,
      "step": 1896,
      "step_time": 25.05549620091915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 192.9375,
      "completions/mean_terminated_length": 192.9375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.4070845916867256,
      "epoch": 0.08786475220009264,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10707610100507736,
      "kl": 0.0032461085938848555,
      "learning_rate": 9.824363131079203e-07,
      "loss": 0.0357,
      "num_tokens": 52204435.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 1897,
      "step_time": 19.98173614963889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 159.4375,
      "completions/mean_terminated_length": 159.4375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.26376891881227493,
      "epoch": 0.08791106993978694,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18185175955295563,
      "kl": 0.0015802593261469156,
      "learning_rate": 9.824270495599814e-07,
      "loss": -0.0206,
      "num_tokens": 52227754.0,
      "reward": 0.9534019827842712,
      "reward_std": 0.10018271207809448,
      "rewards/reward_func/mean": 0.9534019827842712,
      "rewards/reward_func/std": 0.10018270462751389,
      "step": 1898,
      "step_time": 18.17505842074752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 329.0,
      "completions/max_terminated_length": 329.0,
      "completions/mean_length": 248.1875,
      "completions/mean_terminated_length": 248.1875,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "entropy": 0.24741164222359657,
      "epoch": 0.08795738767948125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10063128918409348,
      "kl": 0.001562878693221137,
      "learning_rate": 9.824177860120425e-07,
      "loss": -0.1509,
      "num_tokens": 52267965.0,
      "reward": 0.3122842609882355,
      "reward_std": 0.25194114446640015,
      "rewards/reward_func/mean": 0.3122842609882355,
      "rewards/reward_func/std": 0.25194114446640015,
      "step": 1899,
      "step_time": 30.92778167501092
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 288.0,
      "completions/max_terminated_length": 288.0,
      "completions/mean_length": 170.4375,
      "completions/mean_terminated_length": 170.4375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.36519360542297363,
      "epoch": 0.08800370541917554,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1244279220700264,
      "kl": 0.003008928440976888,
      "learning_rate": 9.824085224641037e-07,
      "loss": -0.1722,
      "num_tokens": 52290932.0,
      "reward": 0.051292113959789276,
      "reward_std": 0.2051684558391571,
      "rewards/reward_func/mean": 0.051292113959789276,
      "rewards/reward_func/std": 0.2051684558391571,
      "step": 1900,
      "step_time": 24.453903168439865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 131.4375,
      "completions/mean_terminated_length": 131.4375,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.24329349771142006,
      "epoch": 0.08805002315886985,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010272827930748463,
      "kl": 0.0010608525190036744,
      "learning_rate": 9.823992589161648e-07,
      "loss": 0.0001,
      "num_tokens": 52314043.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1901,
      "step_time": 14.853858038783073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 135.0625,
      "completions/mean_terminated_length": 135.0625,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.31072814017534256,
      "epoch": 0.08809634089856415,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011184071190655231,
      "kl": 0.0013472106656990945,
      "learning_rate": 9.82389995368226e-07,
      "loss": 0.0001,
      "num_tokens": 52339868.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1902,
      "step_time": 16.587781220674515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 248.1875,
      "completions/mean_terminated_length": 248.1875,
      "completions/min_length": 212.0,
      "completions/min_terminated_length": 212.0,
      "entropy": 0.22742655128240585,
      "epoch": 0.08814265863825846,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0039713154546916485,
      "kl": 0.0021700568904634565,
      "learning_rate": 9.823807318202872e-07,
      "loss": 0.0001,
      "num_tokens": 52363871.0,
      "reward": 0.8531438708305359,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8531438708305359,
      "rewards/reward_func/std": 0.0,
      "step": 1903,
      "step_time": 23.454675372689962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 148.875,
      "completions/mean_terminated_length": 148.875,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.32779307663440704,
      "epoch": 0.08818897637795275,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012335549108684063,
      "kl": 0.001041513794916682,
      "learning_rate": 9.823714682723484e-07,
      "loss": 0.0001,
      "num_tokens": 52399965.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1904,
      "step_time": 19.586184982210398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 192.6875,
      "completions/mean_terminated_length": 192.6875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.40551455318927765,
      "epoch": 0.08823529411764706,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022247496526688337,
      "kl": 0.00166975031606853,
      "learning_rate": 9.823622047244095e-07,
      "loss": 0.0001,
      "num_tokens": 52428360.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1905,
      "step_time": 21.36198526248336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 189.9375,
      "completions/mean_terminated_length": 189.9375,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.3530169874429703,
      "epoch": 0.08828161185734136,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002043097745627165,
      "kl": 0.0015477327397093177,
      "learning_rate": 9.823529411764704e-07,
      "loss": 0.0001,
      "num_tokens": 52457927.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1906,
      "step_time": 20.78644258901477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 135.0,
      "completions/mean_terminated_length": 135.0,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.27532320097088814,
      "epoch": 0.08832792959703567,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013013739138841629,
      "kl": 0.0031569500570185483,
      "learning_rate": 9.823436776285317e-07,
      "loss": 0.0002,
      "num_tokens": 52478743.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1907,
      "step_time": 15.326287671923637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 161.3125,
      "completions/mean_terminated_length": 161.3125,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3997081443667412,
      "epoch": 0.08837424733672997,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022086037788540125,
      "kl": 0.0018891174113377929,
      "learning_rate": 9.823344140805929e-07,
      "loss": 0.0001,
      "num_tokens": 52501484.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1908,
      "step_time": 17.333568029105663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 137.75,
      "completions/mean_terminated_length": 137.75,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2683362513780594,
      "epoch": 0.08842056507642428,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005174525082111359,
      "kl": 0.0018115075945388526,
      "learning_rate": 9.82325150532654e-07,
      "loss": 0.0001,
      "num_tokens": 52521368.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1909,
      "step_time": 15.31071873754263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 176.6875,
      "completions/mean_terminated_length": 176.6875,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.2102694883942604,
      "epoch": 0.08846688281611857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028989873826503754,
      "kl": 0.0015588774112984538,
      "learning_rate": 9.823158869847151e-07,
      "loss": 0.0001,
      "num_tokens": 52550115.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1910,
      "step_time": 19.70367395877838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 114.5,
      "completions/mean_terminated_length": 114.5,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.24982880800962448,
      "epoch": 0.08851320055581288,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001907306956127286,
      "kl": 0.0017034209740813822,
      "learning_rate": 9.823066234367762e-07,
      "loss": 0.0001,
      "num_tokens": 52569371.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1911,
      "step_time": 13.268001470714808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 154.9375,
      "completions/mean_terminated_length": 154.9375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.17168958857655525,
      "epoch": 0.08855951829550718,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12762132287025452,
      "kl": 0.006979975383728743,
      "learning_rate": 9.822973598888374e-07,
      "loss": -0.0495,
      "num_tokens": 52589898.0,
      "reward": 0.597527027130127,
      "reward_std": 0.2399885058403015,
      "rewards/reward_func/mean": 0.597527027130127,
      "rewards/reward_func/std": 0.2399885058403015,
      "step": 1912,
      "step_time": 15.9775386787951
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 203.5625,
      "completions/mean_terminated_length": 203.5625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.19557374715805054,
      "epoch": 0.08860583603520149,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001692920457571745,
      "kl": 0.0012704057735390961,
      "learning_rate": 9.822880963408985e-07,
      "loss": 0.0001,
      "num_tokens": 52613491.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1913,
      "step_time": 20.589644316583872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 166.3125,
      "completions/mean_terminated_length": 166.3125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.3764125630259514,
      "epoch": 0.08865215377489578,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025116396136581898,
      "kl": 0.0018544631893746555,
      "learning_rate": 9.822788327929596e-07,
      "loss": 0.0001,
      "num_tokens": 52650296.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1914,
      "step_time": 21.7999594733119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 179.0,
      "completions/mean_terminated_length": 179.0,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.22333737090229988,
      "epoch": 0.0886984715145901,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09571599215269089,
      "kl": 0.00227964308578521,
      "learning_rate": 9.822695692450207e-07,
      "loss": 0.0318,
      "num_tokens": 52671096.0,
      "reward": 0.9386385679244995,
      "reward_std": 0.06337377429008484,
      "rewards/reward_func/mean": 0.9386385679244995,
      "rewards/reward_func/std": 0.06337378919124603,
      "step": 1915,
      "step_time": 18.661055110394955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 182.25,
      "completions/mean_terminated_length": 182.25,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.22838079929351807,
      "epoch": 0.08874478925428439,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000901050167158246,
      "kl": 0.001069504301995039,
      "learning_rate": 9.822603056970819e-07,
      "loss": 0.0001,
      "num_tokens": 52692876.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 1916,
      "step_time": 19.146745320409536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 141.5625,
      "completions/mean_terminated_length": 141.5625,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.35518868267536163,
      "epoch": 0.0887911069939787,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001423563458956778,
      "kl": 0.0015517770953010768,
      "learning_rate": 9.822510421491432e-07,
      "loss": 0.0001,
      "num_tokens": 52726997.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1917,
      "step_time": 18.057892087846994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 132.4375,
      "completions/mean_terminated_length": 132.4375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.24251199886202812,
      "epoch": 0.088837424733673,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012752824695780873,
      "kl": 0.0012560687900986522,
      "learning_rate": 9.822417786012041e-07,
      "loss": 0.0001,
      "num_tokens": 52748476.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1918,
      "step_time": 15.007533088326454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 126.0,
      "completions/max_terminated_length": 126.0,
      "completions/mean_length": 113.625,
      "completions/mean_terminated_length": 113.625,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2651001140475273,
      "epoch": 0.0888837424733673,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037908037193119526,
      "kl": 0.002068093483103439,
      "learning_rate": 9.822325150532652e-07,
      "loss": 0.0001,
      "num_tokens": 52767878.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1919,
      "step_time": 12.267887149006128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 175.875,
      "completions/mean_terminated_length": 175.875,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.19665367901325226,
      "epoch": 0.0889300602130616,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1086895689368248,
      "kl": 0.002230938378488645,
      "learning_rate": 9.822232515053266e-07,
      "loss": -0.0204,
      "num_tokens": 52790884.0,
      "reward": 0.9019302129745483,
      "reward_std": 0.03052354045212269,
      "rewards/reward_func/mean": 0.9019302129745483,
      "rewards/reward_func/std": 0.03052353300154209,
      "step": 1920,
      "step_time": 17.681687232106924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 181.0,
      "completions/mean_terminated_length": 181.0,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.410856731235981,
      "epoch": 0.08897637795275591,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00221324828453362,
      "kl": 0.0020306374062784016,
      "learning_rate": 9.822139879573877e-07,
      "loss": 0.0001,
      "num_tokens": 52812244.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1921,
      "step_time": 18.97685321420431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 179.125,
      "completions/mean_terminated_length": 179.125,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3178200200200081,
      "epoch": 0.08902269569245021,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18764638900756836,
      "kl": 0.002159391442546621,
      "learning_rate": 9.822047244094488e-07,
      "loss": -0.0269,
      "num_tokens": 52840230.0,
      "reward": 0.050670865923166275,
      "reward_std": 0.0035351975820958614,
      "rewards/reward_func/mean": 0.050670865923166275,
      "rewards/reward_func/std": 0.0035351980477571487,
      "step": 1922,
      "step_time": 19.952056918293238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 352.0,
      "completions/max_terminated_length": 352.0,
      "completions/mean_length": 203.5625,
      "completions/mean_terminated_length": 203.5625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3301963210105896,
      "epoch": 0.08906901343214452,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11845042556524277,
      "kl": 0.003240281599573791,
      "learning_rate": 9.8219546086151e-07,
      "loss": -0.2876,
      "num_tokens": 52863231.0,
      "reward": 0.011592301540076733,
      "reward_std": 0.024922698736190796,
      "rewards/reward_func/mean": 0.011592301540076733,
      "rewards/reward_func/std": 0.024922700598835945,
      "step": 1923,
      "step_time": 28.92820466682315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 170.625,
      "completions/mean_terminated_length": 170.625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.42731931060552597,
      "epoch": 0.08911533117183881,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002207790268585086,
      "kl": 0.0017887434223666787,
      "learning_rate": 9.82186197313571e-07,
      "loss": 0.0001,
      "num_tokens": 52884665.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1924,
      "step_time": 18.699849113821983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 356.0,
      "completions/max_terminated_length": 356.0,
      "completions/mean_length": 337.375,
      "completions/mean_terminated_length": 337.375,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "entropy": 0.14161300286650658,
      "epoch": 0.08916164891153312,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05392554774880409,
      "kl": 0.0006552610284416005,
      "learning_rate": 9.821769337656322e-07,
      "loss": 0.0032,
      "num_tokens": 52927935.0,
      "reward": 0.28361839056015015,
      "reward_std": 0.012069177813827991,
      "rewards/reward_func/mean": 0.28361839056015015,
      "rewards/reward_func/std": 0.012069173157215118,
      "step": 1925,
      "step_time": 35.5646690428257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 366.0,
      "completions/max_terminated_length": 366.0,
      "completions/mean_length": 333.1875,
      "completions/mean_terminated_length": 333.1875,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "entropy": 0.19576743617653847,
      "epoch": 0.08920796665122742,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05160801112651825,
      "kl": 0.0010797584836836904,
      "learning_rate": 9.821676702176933e-07,
      "loss": -0.0007,
      "num_tokens": 52969106.0,
      "reward": 0.9488431215286255,
      "reward_std": 0.06820911169052124,
      "rewards/reward_func/mean": 0.9488431215286255,
      "rewards/reward_func/std": 0.06820911169052124,
      "step": 1926,
      "step_time": 33.76814239099622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 354.0,
      "completions/max_terminated_length": 354.0,
      "completions/mean_length": 312.9375,
      "completions/mean_terminated_length": 312.9375,
      "completions/min_length": 261.0,
      "completions/min_terminated_length": 261.0,
      "entropy": 0.27120207250118256,
      "epoch": 0.08925428439092173,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07882967591285706,
      "kl": 0.0014733055722899735,
      "learning_rate": 9.821584066697544e-07,
      "loss": 0.0231,
      "num_tokens": 53009457.0,
      "reward": 0.7049738764762878,
      "reward_std": 0.3606720566749573,
      "rewards/reward_func/mean": 0.7049738764762878,
      "rewards/reward_func/std": 0.36067211627960205,
      "step": 1927,
      "step_time": 32.67887997999787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 157.875,
      "completions/mean_terminated_length": 157.875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.2225835844874382,
      "epoch": 0.08930060213061602,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13789547979831696,
      "kl": 0.00188472552690655,
      "learning_rate": 9.821491431218156e-07,
      "loss": -0.0418,
      "num_tokens": 53034111.0,
      "reward": 0.4438591003417969,
      "reward_std": 0.030712326988577843,
      "rewards/reward_func/mean": 0.4438591003417969,
      "rewards/reward_func/std": 0.030712325125932693,
      "step": 1928,
      "step_time": 16.75002347677946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 179.25,
      "completions/mean_terminated_length": 179.25,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.25280067324638367,
      "epoch": 0.08934691987031033,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014815613394603133,
      "kl": 0.0012445047905202955,
      "learning_rate": 9.821398795738767e-07,
      "loss": 0.0001,
      "num_tokens": 53057283.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 1929,
      "step_time": 21.23060030862689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 175.9375,
      "completions/mean_terminated_length": 175.9375,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.21397727355360985,
      "epoch": 0.08939323761000463,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034141899086534977,
      "kl": 0.0017794068262446672,
      "learning_rate": 9.82130616025938e-07,
      "loss": 0.0001,
      "num_tokens": 53080754.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1930,
      "step_time": 18.86830211430788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 262.0,
      "completions/max_terminated_length": 262.0,
      "completions/mean_length": 226.875,
      "completions/mean_terminated_length": 226.875,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.3228232190012932,
      "epoch": 0.08943955534969894,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09571702033281326,
      "kl": 0.003071622224524617,
      "learning_rate": 9.82121352477999e-07,
      "loss": -0.0601,
      "num_tokens": 53104832.0,
      "reward": 0.9278337955474854,
      "reward_std": 0.19719551503658295,
      "rewards/reward_func/mean": 0.9278337955474854,
      "rewards/reward_func/std": 0.19719550013542175,
      "step": 1931,
      "step_time": 22.584304578602314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 144.9375,
      "completions/mean_terminated_length": 144.9375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.30679020285606384,
      "epoch": 0.08948587308939324,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016486950917169452,
      "kl": 0.0012764577113557607,
      "learning_rate": 9.8211208893006e-07,
      "loss": 0.0001,
      "num_tokens": 53127343.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1932,
      "step_time": 15.787466999143362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 193.125,
      "completions/mean_terminated_length": 193.125,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.2917402759194374,
      "epoch": 0.08953219082908755,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09777598083019257,
      "kl": 0.0016675287624821067,
      "learning_rate": 9.821028253821214e-07,
      "loss": -0.0091,
      "num_tokens": 53152129.0,
      "reward": 0.5691647529602051,
      "reward_std": 0.07495664060115814,
      "rewards/reward_func/mean": 0.5691647529602051,
      "rewards/reward_func/std": 0.07495662569999695,
      "step": 1933,
      "step_time": 20.581589695066214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 272.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 192.3125,
      "completions/mean_terminated_length": 192.3125,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.3954869881272316,
      "epoch": 0.08957850856878184,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0983789712190628,
      "kl": 0.002569092670455575,
      "learning_rate": 9.820935618341825e-07,
      "loss": -0.1034,
      "num_tokens": 53175238.0,
      "reward": 0.05313796177506447,
      "reward_std": 0.21255184710025787,
      "rewards/reward_func/mean": 0.05313796177506447,
      "rewards/reward_func/std": 0.21255184710025787,
      "step": 1934,
      "step_time": 24.094012692570686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 165.75,
      "completions/mean_terminated_length": 165.75,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.159866314381361,
      "epoch": 0.08962482630847615,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08072976022958755,
      "kl": 0.001008115039439872,
      "learning_rate": 9.820842982862437e-07,
      "loss": 0.0209,
      "num_tokens": 53200530.0,
      "reward": 0.8739733695983887,
      "reward_std": 0.0009956677677109838,
      "rewards/reward_func/mean": 0.8739733695983887,
      "rewards/reward_func/std": 0.0009956508874893188,
      "step": 1935,
      "step_time": 19.25437581166625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 149.625,
      "completions/mean_terminated_length": 149.625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.37753038108348846,
      "epoch": 0.08967114404817045,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014955189544707537,
      "kl": 0.0015476090193260461,
      "learning_rate": 9.820750347383048e-07,
      "loss": 0.0001,
      "num_tokens": 53230268.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1936,
      "step_time": 17.72052463889122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 120.375,
      "completions/mean_terminated_length": 120.375,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.31560005247592926,
      "epoch": 0.08971746178786476,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018230763962492347,
      "kl": 0.001572958513861522,
      "learning_rate": 9.82065771190366e-07,
      "loss": 0.0001,
      "num_tokens": 53252482.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1937,
      "step_time": 13.774052310734987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 164.125,
      "completions/mean_terminated_length": 164.125,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.23324602842330933,
      "epoch": 0.08976377952755905,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029678151477128267,
      "kl": 0.002326177229406312,
      "learning_rate": 9.82056507642427e-07,
      "loss": 0.0001,
      "num_tokens": 53279348.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1938,
      "step_time": 17.855856452137232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 167.1875,
      "completions/mean_terminated_length": 167.1875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.40140706300735474,
      "epoch": 0.08981009726725336,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002848844276741147,
      "kl": 0.0018949246150441468,
      "learning_rate": 9.820472440944882e-07,
      "loss": 0.0001,
      "num_tokens": 53310023.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1939,
      "step_time": 18.78799507766962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 207.6875,
      "completions/mean_terminated_length": 207.6875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.2529890611767769,
      "epoch": 0.08985641500694766,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.103531114757061,
      "kl": 0.0018216021126136184,
      "learning_rate": 9.820379805465493e-07,
      "loss": -0.0469,
      "num_tokens": 53357458.0,
      "reward": 0.6555301547050476,
      "reward_std": 0.4592931568622589,
      "rewards/reward_func/mean": 0.6555301547050476,
      "rewards/reward_func/std": 0.4592931568622589,
      "step": 1940,
      "step_time": 28.453382831066847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 124.625,
      "completions/mean_terminated_length": 124.625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.24574385583400726,
      "epoch": 0.08990273274664197,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001512282993644476,
      "kl": 0.0012296900094952434,
      "learning_rate": 9.820287169986104e-07,
      "loss": 0.0001,
      "num_tokens": 53379116.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1941,
      "step_time": 14.23377349972725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 200.125,
      "completions/mean_terminated_length": 200.125,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.32386118918657303,
      "epoch": 0.08994905048633627,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12431399524211884,
      "kl": 0.0027555939159356058,
      "learning_rate": 9.820194534506715e-07,
      "loss": -0.1047,
      "num_tokens": 53410830.0,
      "reward": 0.1294839084148407,
      "reward_std": 0.23305761814117432,
      "rewards/reward_func/mean": 0.1294839084148407,
      "rewards/reward_func/std": 0.23305761814117432,
      "step": 1942,
      "step_time": 26.170398607850075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 283.0,
      "completions/max_terminated_length": 283.0,
      "completions/mean_length": 228.25,
      "completions/mean_terminated_length": 228.25,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.43460363894701004,
      "epoch": 0.08999536822603058,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07519284635782242,
      "kl": 0.0021609098184853792,
      "learning_rate": 9.820101899027327e-07,
      "loss": -0.0258,
      "num_tokens": 53441378.0,
      "reward": 0.005189642775803804,
      "reward_std": 0.020758571103215218,
      "rewards/reward_func/mean": 0.005189642775803804,
      "rewards/reward_func/std": 0.020758571103215218,
      "step": 1943,
      "step_time": 25.71977098658681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 143.25,
      "completions/mean_terminated_length": 143.25,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.31021514534950256,
      "epoch": 0.09004168596572487,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003140147076919675,
      "kl": 0.002087398723233491,
      "learning_rate": 9.820009263547938e-07,
      "loss": 0.0001,
      "num_tokens": 53462358.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1944,
      "step_time": 16.813846472650766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 142.6875,
      "completions/mean_terminated_length": 142.6875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.3413543477654457,
      "epoch": 0.09008800370541918,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014058761298656464,
      "kl": 0.0013815262645948678,
      "learning_rate": 9.81991662806855e-07,
      "loss": 0.0001,
      "num_tokens": 53487681.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1945,
      "step_time": 17.097448244690895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 122.0,
      "completions/max_terminated_length": 122.0,
      "completions/mean_length": 106.0625,
      "completions/mean_terminated_length": 106.0625,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.22293299436569214,
      "epoch": 0.09013432144511348,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013684408040717244,
      "kl": 0.0012961396569153294,
      "learning_rate": 9.81982399258916e-07,
      "loss": 0.0001,
      "num_tokens": 53508018.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1946,
      "step_time": 12.469510365277529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 120.4375,
      "completions/mean_terminated_length": 120.4375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2610037550330162,
      "epoch": 0.09018063918480779,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009769543539732695,
      "kl": 0.0012118838203605264,
      "learning_rate": 9.819731357109774e-07,
      "loss": 0.0001,
      "num_tokens": 53528985.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1947,
      "step_time": 13.232079800218344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 148.6875,
      "completions/mean_terminated_length": 148.6875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.16816206276416779,
      "epoch": 0.09022695692450208,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010869316756725311,
      "kl": 0.0008072914788499475,
      "learning_rate": 9.819638721630385e-07,
      "loss": 0.0,
      "num_tokens": 53550564.0,
      "reward": 0.24659696221351624,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.24659696221351624,
      "rewards/reward_func/std": 0.0,
      "step": 1948,
      "step_time": 16.953522082418203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 182.0,
      "completions/mean_terminated_length": 182.0,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.35463452339172363,
      "epoch": 0.09027327466419639,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12420963495969772,
      "kl": 0.003538883465807885,
      "learning_rate": 9.819546086150994e-07,
      "loss": -0.1319,
      "num_tokens": 53591716.0,
      "reward": 0.25,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 1949,
      "step_time": 25.467616628855467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 203.5625,
      "completions/mean_terminated_length": 203.5625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.4176323562860489,
      "epoch": 0.09031959240389069,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003623420372605324,
      "kl": 0.0024453586665913463,
      "learning_rate": 9.819453450671607e-07,
      "loss": 0.0001,
      "num_tokens": 53622317.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1950,
      "step_time": 25.065615337342024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 147.8125,
      "completions/mean_terminated_length": 147.8125,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.18409983441233635,
      "epoch": 0.090365910143585,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08783780038356781,
      "kl": 0.0014407682756427675,
      "learning_rate": 9.819360815192219e-07,
      "loss": 0.0017,
      "num_tokens": 53644554.0,
      "reward": 0.116437628865242,
      "reward_std": 0.008144183084368706,
      "rewards/reward_func/mean": 0.116437628865242,
      "rewards/reward_func/std": 0.008144183084368706,
      "step": 1951,
      "step_time": 17.291895169764757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 142.0625,
      "completions/mean_terminated_length": 142.0625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.21944910287857056,
      "epoch": 0.0904122278832793,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021472317166626453,
      "kl": 0.0013448151585180312,
      "learning_rate": 9.81926817971283e-07,
      "loss": 0.0001,
      "num_tokens": 53664331.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1952,
      "step_time": 14.36643573269248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 116.625,
      "completions/mean_terminated_length": 116.625,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.28818097710609436,
      "epoch": 0.0904585456229736,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002776138950139284,
      "kl": 0.0015259020728990436,
      "learning_rate": 9.819175544233441e-07,
      "loss": 0.0001,
      "num_tokens": 53686277.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1953,
      "step_time": 13.262791015207767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 170.4375,
      "completions/mean_terminated_length": 170.4375,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.35676541924476624,
      "epoch": 0.0905048633626679,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019279010593891144,
      "kl": 0.001689134689513594,
      "learning_rate": 9.819082908754052e-07,
      "loss": 0.0001,
      "num_tokens": 53707836.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1954,
      "step_time": 18.223545279353857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 125.375,
      "completions/mean_terminated_length": 125.375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.25817713141441345,
      "epoch": 0.09055118110236221,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030821990221738815,
      "kl": 0.0015234506863635033,
      "learning_rate": 9.818990273274664e-07,
      "loss": 0.0001,
      "num_tokens": 53729122.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1955,
      "step_time": 13.691711734980345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 179.75,
      "completions/mean_terminated_length": 179.75,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.19519327953457832,
      "epoch": 0.0905974988420565,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08650083094835281,
      "kl": 0.0010984998807543889,
      "learning_rate": 9.818897637795275e-07,
      "loss": -0.0561,
      "num_tokens": 53755326.0,
      "reward": 0.6673440337181091,
      "reward_std": 0.011904047802090645,
      "rewards/reward_func/mean": 0.6673440337181091,
      "rewards/reward_func/std": 0.01190404687076807,
      "step": 1956,
      "step_time": 19.144754018634558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 197.75,
      "completions/mean_terminated_length": 197.75,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.13502133265137672,
      "epoch": 0.09064381658175082,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012273151660338044,
      "kl": 0.0008055913640419021,
      "learning_rate": 9.818805002315886e-07,
      "loss": 0.0,
      "num_tokens": 53777434.0,
      "reward": 0.9555630087852478,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9555630087852478,
      "rewards/reward_func/std": 0.0,
      "step": 1957,
      "step_time": 20.97706549987197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 367.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 300.5,
      "completions/mean_terminated_length": 300.5,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "entropy": 0.23473021015524864,
      "epoch": 0.09069013432144511,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09171711653470993,
      "kl": 0.0016145984409376979,
      "learning_rate": 9.818712366836497e-07,
      "loss": -0.021,
      "num_tokens": 53818418.0,
      "reward": 0.7842717170715332,
      "reward_std": 0.14640463888645172,
      "rewards/reward_func/mean": 0.7842717170715332,
      "rewards/reward_func/std": 0.14640463888645172,
      "step": 1958,
      "step_time": 33.88829968124628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 167.0625,
      "completions/mean_terminated_length": 167.0625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.41302254796028137,
      "epoch": 0.09073645206113942,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017865417758002877,
      "kl": 0.0017920045065693557,
      "learning_rate": 9.818619731357109e-07,
      "loss": 0.0001,
      "num_tokens": 53848755.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1959,
      "step_time": 19.698135547339916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 193.6875,
      "completions/mean_terminated_length": 193.6875,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.28678224980831146,
      "epoch": 0.09078276980083372,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11251778155565262,
      "kl": 0.0018669200071599334,
      "learning_rate": 9.818527095877722e-07,
      "loss": -0.0109,
      "num_tokens": 53877726.0,
      "reward": 0.9071022272109985,
      "reward_std": 0.24333836138248444,
      "rewards/reward_func/mean": 0.9071022272109985,
      "rewards/reward_func/std": 0.24333836138248444,
      "step": 1960,
      "step_time": 21.394634574651718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 204.6875,
      "completions/mean_terminated_length": 204.6875,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.2744165435433388,
      "epoch": 0.09082908754052803,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12224402278661728,
      "kl": 0.0029156223754398525,
      "learning_rate": 9.818434460398331e-07,
      "loss": -0.034,
      "num_tokens": 53904585.0,
      "reward": 0.5987157821655273,
      "reward_std": 0.1857866793870926,
      "rewards/reward_func/mean": 0.5987157821655273,
      "rewards/reward_func/std": 0.1857866793870926,
      "step": 1961,
      "step_time": 21.730876356363297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 171.125,
      "completions/mean_terminated_length": 171.125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.23464681208133698,
      "epoch": 0.09087540528022232,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13402652740478516,
      "kl": 0.0016827634535729885,
      "learning_rate": 9.818341824918942e-07,
      "loss": -0.0029,
      "num_tokens": 53925691.0,
      "reward": 0.16108980774879456,
      "reward_std": 0.05650884658098221,
      "rewards/reward_func/mean": 0.16108980774879456,
      "rewards/reward_func/std": 0.05650884658098221,
      "step": 1962,
      "step_time": 18.06282015517354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 169.375,
      "completions/mean_terminated_length": 169.375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.36945798993110657,
      "epoch": 0.09092172301991663,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030708175618201494,
      "kl": 0.0022847213549539447,
      "learning_rate": 9.818249189439554e-07,
      "loss": 0.0001,
      "num_tokens": 53946689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1963,
      "step_time": 20.316118702292442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 208.9375,
      "completions/mean_terminated_length": 208.9375,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.30894362181425095,
      "epoch": 0.09096804075961093,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015511916717514396,
      "kl": 0.0016433202545158565,
      "learning_rate": 9.818156553960167e-07,
      "loss": 0.0001,
      "num_tokens": 53973664.0,
      "reward": 0.5795782804489136,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5795782804489136,
      "rewards/reward_func/std": 0.0,
      "step": 1964,
      "step_time": 22.227172508835793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 119.3125,
      "completions/mean_terminated_length": 119.3125,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.2962167412042618,
      "epoch": 0.09101435849930524,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014858448412269354,
      "kl": 0.0013716779358219355,
      "learning_rate": 9.818063918480778e-07,
      "loss": 0.0001,
      "num_tokens": 53993429.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1965,
      "step_time": 14.127327963709831
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 200.375,
      "completions/mean_terminated_length": 200.375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.2774844169616699,
      "epoch": 0.09106067623899954,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011752174468711019,
      "kl": 0.0011603307939367369,
      "learning_rate": 9.81797128300139e-07,
      "loss": 0.0001,
      "num_tokens": 54019979.0,
      "reward": 0.4029351770877838,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.4029351770877838,
      "rewards/reward_func/std": 0.0,
      "step": 1966,
      "step_time": 21.35994939506054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 159.25,
      "completions/mean_terminated_length": 159.25,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.24713463708758354,
      "epoch": 0.09110699397869385,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0783422440290451,
      "kl": 0.0015384795842692256,
      "learning_rate": 9.817878647522e-07,
      "loss": -0.0317,
      "num_tokens": 54041263.0,
      "reward": 0.8954063653945923,
      "reward_std": 0.03804173320531845,
      "rewards/reward_func/mean": 0.8954063653945923,
      "rewards/reward_func/std": 0.038041744381189346,
      "step": 1967,
      "step_time": 18.747099719941616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 108.125,
      "completions/mean_terminated_length": 108.125,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "entropy": 0.3539968729019165,
      "epoch": 0.09115331171838814,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004241200629621744,
      "kl": 0.0021811652986798435,
      "learning_rate": 9.817786012042612e-07,
      "loss": 0.0001,
      "num_tokens": 54063969.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1968,
      "step_time": 13.496718242764473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 167.5625,
      "completions/mean_terminated_length": 167.5625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.35350824892520905,
      "epoch": 0.09119962945808245,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015196891035884619,
      "kl": 0.0016085615498013794,
      "learning_rate": 9.817693376563223e-07,
      "loss": 0.0001,
      "num_tokens": 54086602.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1969,
      "step_time": 19.123491693288088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 173.625,
      "completions/mean_terminated_length": 173.625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.1599022075533867,
      "epoch": 0.09124594719777675,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008308873511850834,
      "kl": 0.0009892965463222936,
      "learning_rate": 9.817600741083835e-07,
      "loss": 0.0,
      "num_tokens": 54112068.0,
      "reward": 0.894839346408844,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.894839346408844,
      "rewards/reward_func/std": 0.0,
      "step": 1970,
      "step_time": 19.56661333888769
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 116.3125,
      "completions/mean_terminated_length": 116.3125,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.30716241151094437,
      "epoch": 0.09129226493747106,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005867612082511187,
      "kl": 0.0022151133161969483,
      "learning_rate": 9.817508105604446e-07,
      "loss": 0.0001,
      "num_tokens": 54132921.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1971,
      "step_time": 13.333259463310242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 188.625,
      "completions/mean_terminated_length": 188.625,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.24652868881821632,
      "epoch": 0.09133858267716535,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011445103446021676,
      "kl": 0.001014224108075723,
      "learning_rate": 9.817415470125057e-07,
      "loss": 0.0001,
      "num_tokens": 54161907.0,
      "reward": 0.13406634330749512,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.13406634330749512,
      "rewards/reward_func/std": 0.0,
      "step": 1972,
      "step_time": 22.42594589293003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 134.0625,
      "completions/mean_terminated_length": 134.0625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.28645943105220795,
      "epoch": 0.09138490041685966,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019106051186099648,
      "kl": 0.0014066417643334717,
      "learning_rate": 9.81732283464567e-07,
      "loss": 0.0001,
      "num_tokens": 54183604.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1973,
      "step_time": 14.717470478266478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 171.3125,
      "completions/mean_terminated_length": 171.3125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.38657428324222565,
      "epoch": 0.09143121815655396,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011994995875284076,
      "kl": 0.0013932413421571255,
      "learning_rate": 9.81723019916628e-07,
      "loss": 0.0001,
      "num_tokens": 54211769.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1974,
      "step_time": 19.652615182101727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 199.0625,
      "completions/mean_terminated_length": 199.0625,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.40833304822444916,
      "epoch": 0.09147753589624827,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002428983571007848,
      "kl": 0.0019206492870580405,
      "learning_rate": 9.81713756368689e-07,
      "loss": 0.0001,
      "num_tokens": 54236826.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1975,
      "step_time": 24.90197415649891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 167.8125,
      "completions/mean_terminated_length": 167.8125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.4385068714618683,
      "epoch": 0.09152385363594256,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005276212468743324,
      "kl": 0.0030843618442304432,
      "learning_rate": 9.817044928207502e-07,
      "loss": 0.0002,
      "num_tokens": 54279607.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1976,
      "step_time": 24.52617084607482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 317.0,
      "completions/max_terminated_length": 317.0,
      "completions/mean_length": 269.0625,
      "completions/mean_terminated_length": 269.0625,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "entropy": 0.24865829199552536,
      "epoch": 0.09157017137563687,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015469096833840013,
      "kl": 0.0015839740808587521,
      "learning_rate": 9.816952292728115e-07,
      "loss": 0.0001,
      "num_tokens": 54306488.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1977,
      "step_time": 26.93454695865512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 193.3125,
      "completions/mean_terminated_length": 193.3125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.2284526750445366,
      "epoch": 0.09161648911533117,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019630505703389645,
      "kl": 0.0016949988203123212,
      "learning_rate": 9.816859657248727e-07,
      "loss": 0.0001,
      "num_tokens": 54334221.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1978,
      "step_time": 20.993782050907612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 161.375,
      "completions/mean_terminated_length": 161.375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.39297013729810715,
      "epoch": 0.09166280685502548,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00118962861597538,
      "kl": 0.0014068300661165267,
      "learning_rate": 9.816767021769338e-07,
      "loss": 0.0001,
      "num_tokens": 54371091.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1979,
      "step_time": 20.093108519911766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 124.125,
      "completions/mean_terminated_length": 124.125,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.27579325065016747,
      "epoch": 0.09170912459471978,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003379854140803218,
      "kl": 0.001728180272039026,
      "learning_rate": 9.81667438628995e-07,
      "loss": 0.0001,
      "num_tokens": 54391109.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1980,
      "step_time": 13.318691533058882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 167.6875,
      "completions/mean_terminated_length": 167.6875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.1334142778068781,
      "epoch": 0.09175544233441409,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008702704217284918,
      "kl": 0.0007795136043569073,
      "learning_rate": 9.81658175081056e-07,
      "loss": 0.0,
      "num_tokens": 54426864.0,
      "reward": 0.8507331609725952,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8507331609725952,
      "rewards/reward_func/std": 0.0,
      "step": 1981,
      "step_time": 20.51024015620351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 175.0,
      "completions/mean_terminated_length": 175.0,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.33453385531902313,
      "epoch": 0.09180176007410838,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004719714168459177,
      "kl": 0.0031589337158948183,
      "learning_rate": 9.816489115331172e-07,
      "loss": 0.0002,
      "num_tokens": 54448624.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 1982,
      "step_time": 18.334271013736725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 117.5,
      "completions/mean_terminated_length": 117.5,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2871744707226753,
      "epoch": 0.09184807781380269,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010424979031085968,
      "kl": 0.0027162292681168765,
      "learning_rate": 9.816396479851783e-07,
      "loss": 0.0001,
      "num_tokens": 54468872.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1983,
      "step_time": 13.565622244030237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 188.0,
      "completions/mean_terminated_length": 188.0,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3009023070335388,
      "epoch": 0.09189439555349699,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019037388265132904,
      "kl": 0.001530302397441119,
      "learning_rate": 9.816303844372394e-07,
      "loss": 0.0001,
      "num_tokens": 54495240.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 1984,
      "step_time": 20.63035625964403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 198.4375,
      "completions/mean_terminated_length": 198.4375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.4101143106818199,
      "epoch": 0.0919407132931913,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029017613269388676,
      "kl": 0.0018669950077310205,
      "learning_rate": 9.816211208893005e-07,
      "loss": 0.0001,
      "num_tokens": 54521071.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1985,
      "step_time": 22.225163273513317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 132.375,
      "completions/mean_terminated_length": 132.375,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2674574702978134,
      "epoch": 0.0919870310328856,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003091532038524747,
      "kl": 0.002030002244282514,
      "learning_rate": 9.816118573413617e-07,
      "loss": 0.0001,
      "num_tokens": 54540581.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1986,
      "step_time": 14.761275552213192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 151.375,
      "completions/mean_terminated_length": 151.375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.18644894286990166,
      "epoch": 0.0920333487725799,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00401868112385273,
      "kl": 0.0021452106011565775,
      "learning_rate": 9.816025937934228e-07,
      "loss": 0.0001,
      "num_tokens": 54561739.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 1987,
      "step_time": 17.11887515336275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 174.0625,
      "completions/mean_terminated_length": 174.0625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.1855168156325817,
      "epoch": 0.0920796665122742,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026354107540100813,
      "kl": 0.0016058009350672364,
      "learning_rate": 9.81593330245484e-07,
      "loss": 0.0001,
      "num_tokens": 54583148.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1988,
      "step_time": 18.430659186095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 113.0,
      "completions/mean_terminated_length": 113.0,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.22687219083309174,
      "epoch": 0.09212598425196851,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023138225078582764,
      "kl": 0.0012974188139196485,
      "learning_rate": 9.81584066697545e-07,
      "loss": 0.0001,
      "num_tokens": 54603756.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1989,
      "step_time": 12.908747110515833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 153.9375,
      "completions/mean_terminated_length": 153.9375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.408561073243618,
      "epoch": 0.0921723019916628,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014644854236394167,
      "kl": 0.0019203444826416671,
      "learning_rate": 9.815748031496064e-07,
      "loss": 0.0001,
      "num_tokens": 54652683.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1990,
      "step_time": 23.003245670348406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 197.125,
      "completions/mean_terminated_length": 197.125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.22864800691604614,
      "epoch": 0.09221861973135712,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11659272015094757,
      "kl": 0.0027175315481144935,
      "learning_rate": 9.815655396016675e-07,
      "loss": -0.0223,
      "num_tokens": 54677293.0,
      "reward": 0.38041621446609497,
      "reward_std": 0.4716108739376068,
      "rewards/reward_func/mean": 0.38041621446609497,
      "rewards/reward_func/std": 0.4716108739376068,
      "step": 1991,
      "step_time": 19.50218654796481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 217.0625,
      "completions/mean_terminated_length": 217.0625,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.19894057139754295,
      "epoch": 0.09226493747105141,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006900215987116098,
      "kl": 0.005521606304682791,
      "learning_rate": 9.815562760537284e-07,
      "loss": 0.0003,
      "num_tokens": 54709758.0,
      "reward": 0.7708956003189087,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7708956003189087,
      "rewards/reward_func/std": 0.0,
      "step": 1992,
      "step_time": 24.340532917529345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 173.625,
      "completions/mean_terminated_length": 173.625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.17384374886751175,
      "epoch": 0.09231125521074572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08830630779266357,
      "kl": 0.0021812051709275693,
      "learning_rate": 9.815470125057895e-07,
      "loss": -0.0088,
      "num_tokens": 54734440.0,
      "reward": 0.7111106514930725,
      "reward_std": 0.003023663302883506,
      "rewards/reward_func/mean": 0.7111106514930725,
      "rewards/reward_func/std": 0.003023657714948058,
      "step": 1993,
      "step_time": 18.362381052225828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 129.0,
      "completions/max_terminated_length": 129.0,
      "completions/mean_length": 117.0,
      "completions/mean_terminated_length": 117.0,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.23678594082593918,
      "epoch": 0.09235757295044002,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025700826663523912,
      "kl": 0.001432108401786536,
      "learning_rate": 9.815377489578509e-07,
      "loss": 0.0001,
      "num_tokens": 54753720.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1994,
      "step_time": 13.111877344548702
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 171.375,
      "completions/mean_terminated_length": 171.375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.15514958649873734,
      "epoch": 0.09240389069013433,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011596870608627796,
      "kl": 0.0009256289049517363,
      "learning_rate": 9.81528485409912e-07,
      "loss": 0.0,
      "num_tokens": 54799118.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 1995,
      "step_time": 24.76950002834201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 132.5,
      "completions/mean_terminated_length": 132.5,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.253715917468071,
      "epoch": 0.09245020842982862,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013802764005959034,
      "kl": 0.0013290940260048956,
      "learning_rate": 9.815192218619731e-07,
      "loss": 0.0001,
      "num_tokens": 54820118.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1996,
      "step_time": 14.5656126588583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 143.9375,
      "completions/mean_terminated_length": 143.9375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.33426566421985626,
      "epoch": 0.09249652616952293,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002717754105105996,
      "kl": 0.0019190639141015708,
      "learning_rate": 9.815099583140342e-07,
      "loss": 0.0001,
      "num_tokens": 54841749.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1997,
      "step_time": 16.373453199863434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 136.375,
      "completions/mean_terminated_length": 136.375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.26538507640361786,
      "epoch": 0.09254284390921723,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002579397289082408,
      "kl": 0.0014253585104597732,
      "learning_rate": 9.815006947660954e-07,
      "loss": 0.0001,
      "num_tokens": 54863499.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1998,
      "step_time": 14.984016232192516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 143.5625,
      "completions/mean_terminated_length": 143.5625,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2968463897705078,
      "epoch": 0.09258916164891154,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002322679152712226,
      "kl": 0.0015304088883567601,
      "learning_rate": 9.814914312181565e-07,
      "loss": 0.0001,
      "num_tokens": 54885268.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 1999,
      "step_time": 15.738748639822006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 154.8125,
      "completions/mean_terminated_length": 154.8125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.42615479975938797,
      "epoch": 0.09263547938860583,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001267561805434525,
      "kl": 0.0015495356929022819,
      "learning_rate": 9.814821676702176e-07,
      "loss": 0.0001,
      "num_tokens": 54915249.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2000,
      "step_time": 18.564595259726048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 106.6875,
      "completions/mean_terminated_length": 106.6875,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.24032869935035706,
      "epoch": 0.09268179712830014,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011372538283467293,
      "kl": 0.0010614956117933616,
      "learning_rate": 9.814729041222787e-07,
      "loss": 0.0001,
      "num_tokens": 54935500.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2001,
      "step_time": 13.356164246797562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 204.75,
      "completions/mean_terminated_length": 204.75,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.2812078222632408,
      "epoch": 0.09272811486799444,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13774681091308594,
      "kl": 0.003768278518691659,
      "learning_rate": 9.814636405743399e-07,
      "loss": 0.0101,
      "num_tokens": 54973784.0,
      "reward": 0.7076573371887207,
      "reward_std": 0.0030280218925327063,
      "rewards/reward_func/mean": 0.7076573371887207,
      "rewards/reward_func/std": 0.0030280244536697865,
      "step": 2002,
      "step_time": 25.842976734042168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 150.0,
      "completions/mean_terminated_length": 150.0,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3747788146138191,
      "epoch": 0.09277443260768875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008774223388172686,
      "kl": 0.0013073801819700748,
      "learning_rate": 9.814543770264012e-07,
      "loss": 0.0001,
      "num_tokens": 55006136.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2003,
      "step_time": 18.33067310601473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 140.25,
      "completions/mean_terminated_length": 140.25,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.22896815836429596,
      "epoch": 0.09282075034738305,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037381534930318594,
      "kl": 0.0016159997030626982,
      "learning_rate": 9.814451134784623e-07,
      "loss": 0.0001,
      "num_tokens": 55026828.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2004,
      "step_time": 15.401339266449213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 193.75,
      "completions/mean_terminated_length": 193.75,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.4159994348883629,
      "epoch": 0.09286706808707736,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026936079375445843,
      "kl": 0.0023532802588306367,
      "learning_rate": 9.814358499305232e-07,
      "loss": 0.0001,
      "num_tokens": 55051096.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2005,
      "step_time": 19.05104163661599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 188.125,
      "completions/mean_terminated_length": 188.125,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.16379571333527565,
      "epoch": 0.09291338582677165,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007707058684900403,
      "kl": 0.0007123186805984005,
      "learning_rate": 9.814265863825844e-07,
      "loss": 0.0,
      "num_tokens": 55083994.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 2006,
      "step_time": 21.624755449593067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 155.3125,
      "completions/mean_terminated_length": 155.3125,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.419580914080143,
      "epoch": 0.09295970356646596,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015239104395732284,
      "kl": 0.002229607925983146,
      "learning_rate": 9.814173228346457e-07,
      "loss": 0.0001,
      "num_tokens": 55132879.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2007,
      "step_time": 23.54218227788806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 166.1875,
      "completions/mean_terminated_length": 166.1875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.2905442714691162,
      "epoch": 0.09300602130616026,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15490469336509705,
      "kl": 0.003280369914136827,
      "learning_rate": 9.814080592867068e-07,
      "loss": 0.0328,
      "num_tokens": 55154722.0,
      "reward": 0.9940523505210876,
      "reward_std": 0.023790646344423294,
      "rewards/reward_func/mean": 0.9940523505210876,
      "rewards/reward_func/std": 0.023790642619132996,
      "step": 2008,
      "step_time": 19.209634006023407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 310.0,
      "completions/max_terminated_length": 310.0,
      "completions/mean_length": 282.9375,
      "completions/mean_terminated_length": 282.9375,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "entropy": 0.2532089799642563,
      "epoch": 0.09305233904585457,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001870440086349845,
      "kl": 0.0015394982474390417,
      "learning_rate": 9.81398795738768e-07,
      "loss": 0.0001,
      "num_tokens": 55190353.0,
      "reward": 0.8887742161750793,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8887742161750793,
      "rewards/reward_func/std": 0.0,
      "step": 2009,
      "step_time": 29.07831295952201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 170.6875,
      "completions/mean_terminated_length": 170.6875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.21067959442734718,
      "epoch": 0.09309865678554886,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14396819472312927,
      "kl": 0.003258177952375263,
      "learning_rate": 9.81389532190829e-07,
      "loss": -0.0437,
      "num_tokens": 55213900.0,
      "reward": 0.8580853939056396,
      "reward_std": 0.1954721212387085,
      "rewards/reward_func/mean": 0.8580853939056396,
      "rewards/reward_func/std": 0.1954721212387085,
      "step": 2010,
      "step_time": 18.470594085752964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 163.25,
      "completions/mean_terminated_length": 163.25,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.2205018289387226,
      "epoch": 0.09314497452524317,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013772568199783564,
      "kl": 0.0012428172049112618,
      "learning_rate": 9.813802686428902e-07,
      "loss": 0.0001,
      "num_tokens": 55245296.0,
      "reward": 0.9487294554710388,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9487294554710388,
      "rewards/reward_func/std": 0.0,
      "step": 2011,
      "step_time": 19.049249719828367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 177.1875,
      "completions/mean_terminated_length": 177.1875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.174911018460989,
      "epoch": 0.09319129226493747,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011804018868133426,
      "kl": 0.0010020088957389817,
      "learning_rate": 9.813710050949513e-07,
      "loss": 0.0,
      "num_tokens": 55266467.0,
      "reward": 0.747017502784729,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.747017502784729,
      "rewards/reward_func/std": 0.0,
      "step": 2012,
      "step_time": 17.20403290167451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 172.875,
      "completions/mean_terminated_length": 172.875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.16459906101226807,
      "epoch": 0.09323761000463178,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16722524166107178,
      "kl": 0.004344968969235197,
      "learning_rate": 9.813617415470125e-07,
      "loss": -0.1407,
      "num_tokens": 55287713.0,
      "reward": 0.32027894258499146,
      "reward_std": 0.3841894567012787,
      "rewards/reward_func/mean": 0.32027894258499146,
      "rewards/reward_func/std": 0.3841894567012787,
      "step": 2013,
      "step_time": 19.53892307356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 206.5,
      "completions/mean_terminated_length": 206.5,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.16332386806607246,
      "epoch": 0.09328392774432608,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08634932339191437,
      "kl": 0.001094053266569972,
      "learning_rate": 9.813524779990736e-07,
      "loss": -0.0198,
      "num_tokens": 55311961.0,
      "reward": 0.9664702415466309,
      "reward_std": 0.023347126320004463,
      "rewards/reward_func/mean": 0.9664702415466309,
      "rewards/reward_func/std": 0.023347120732069016,
      "step": 2014,
      "step_time": 20.719892770051956
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 307.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 266.75,
      "completions/mean_terminated_length": 266.75,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "entropy": 0.2889692559838295,
      "epoch": 0.09333024548402039,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0823928490281105,
      "kl": 0.0021612555137835443,
      "learning_rate": 9.813432144511347e-07,
      "loss": -0.0869,
      "num_tokens": 55344389.0,
      "reward": 0.5739880800247192,
      "reward_std": 0.196987584233284,
      "rewards/reward_func/mean": 0.5739880800247192,
      "rewards/reward_func/std": 0.196987584233284,
      "step": 2015,
      "step_time": 27.850818529725075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 232.4375,
      "completions/mean_terminated_length": 232.4375,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "entropy": 0.20757009834051132,
      "epoch": 0.09337656322371468,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0711066871881485,
      "kl": 0.002094621246214956,
      "learning_rate": 9.813339509031958e-07,
      "loss": -0.0323,
      "num_tokens": 55369884.0,
      "reward": 0.7875936031341553,
      "reward_std": 0.10538309067487717,
      "rewards/reward_func/mean": 0.7875936031341553,
      "rewards/reward_func/std": 0.10538309812545776,
      "step": 2016,
      "step_time": 23.083560083061457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 196.3125,
      "completions/mean_terminated_length": 196.3125,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.331020712852478,
      "epoch": 0.09342288096340899,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11179359257221222,
      "kl": 0.004218329850118607,
      "learning_rate": 9.81324687355257e-07,
      "loss": -0.0371,
      "num_tokens": 55407265.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 2017,
      "step_time": 24.803230065852404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 183.375,
      "completions/mean_terminated_length": 183.375,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.40553393214941025,
      "epoch": 0.09346919870310329,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002806166186928749,
      "kl": 0.001957463100552559,
      "learning_rate": 9.81315423807318e-07,
      "loss": 0.0001,
      "num_tokens": 55433991.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2018,
      "step_time": 19.76413144916296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 173.4375,
      "completions/mean_terminated_length": 173.4375,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.3953156992793083,
      "epoch": 0.0935155164427976,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001481954357586801,
      "kl": 0.0014554686786141247,
      "learning_rate": 9.813061602593792e-07,
      "loss": 0.0001,
      "num_tokens": 55458078.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2019,
      "step_time": 20.22554812580347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 193.0625,
      "completions/mean_terminated_length": 193.0625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4021492078900337,
      "epoch": 0.0935618341824919,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005300453864037991,
      "kl": 0.003082145005464554,
      "learning_rate": 9.812968967114405e-07,
      "loss": 0.0002,
      "num_tokens": 55481983.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2020,
      "step_time": 20.60504487901926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 191.4375,
      "completions/mean_terminated_length": 191.4375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.40075282752513885,
      "epoch": 0.0936081519221862,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013799435691908002,
      "kl": 0.0014147369656711817,
      "learning_rate": 9.812876331635017e-07,
      "loss": 0.0001,
      "num_tokens": 55512950.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2021,
      "step_time": 21.51679801568389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 197.125,
      "completions/mean_terminated_length": 197.125,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.18907545879483223,
      "epoch": 0.0936544696618805,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12522315979003906,
      "kl": 0.0015172976854955778,
      "learning_rate": 9.812783696155628e-07,
      "loss": 0.035,
      "num_tokens": 55534296.0,
      "reward": 0.9603575468063354,
      "reward_std": 0.05285662040114403,
      "rewards/reward_func/mean": 0.9603575468063354,
      "rewards/reward_func/std": 0.052856624126434326,
      "step": 2022,
      "step_time": 20.253864627331495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 124.1875,
      "completions/mean_terminated_length": 124.1875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.34351880848407745,
      "epoch": 0.09370078740157481,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025619296357035637,
      "kl": 0.0018770191818475723,
      "learning_rate": 9.812691060676237e-07,
      "loss": 0.0001,
      "num_tokens": 55558219.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2023,
      "step_time": 14.198850486427546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 325.0,
      "completions/max_terminated_length": 325.0,
      "completions/mean_length": 185.125,
      "completions/mean_terminated_length": 185.125,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.4475076347589493,
      "epoch": 0.0937471051412691,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033801370300352573,
      "kl": 0.002179990289732814,
      "learning_rate": 9.81259842519685e-07,
      "loss": 0.0001,
      "num_tokens": 55590413.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2024,
      "step_time": 29.04584624245763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 218.9375,
      "completions/mean_terminated_length": 218.9375,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.23236701637506485,
      "epoch": 0.09379342288096341,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09577567130327225,
      "kl": 0.0035261387238278985,
      "learning_rate": 9.812505789717462e-07,
      "loss": -0.037,
      "num_tokens": 55613724.0,
      "reward": 0.7402232885360718,
      "reward_std": 0.2639003396034241,
      "rewards/reward_func/mean": 0.7402232885360718,
      "rewards/reward_func/std": 0.2639003396034241,
      "step": 2025,
      "step_time": 21.337419539690018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 134.75,
      "completions/mean_terminated_length": 134.75,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.33088279515504837,
      "epoch": 0.09383974062065771,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004115608055144548,
      "kl": 0.0020482465624809265,
      "learning_rate": 9.812413154238073e-07,
      "loss": 0.0001,
      "num_tokens": 55639240.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2026,
      "step_time": 15.747435353696346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 201.5625,
      "completions/mean_terminated_length": 201.5625,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.4111289530992508,
      "epoch": 0.09388605836035202,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10355306416749954,
      "kl": 0.00337825994938612,
      "learning_rate": 9.812320518758684e-07,
      "loss": 0.0917,
      "num_tokens": 55674609.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 2027,
      "step_time": 26.33951948583126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 293.0,
      "completions/max_terminated_length": 293.0,
      "completions/mean_length": 244.0625,
      "completions/mean_terminated_length": 244.0625,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "entropy": 0.3604082316160202,
      "epoch": 0.09393237610004632,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08023695647716522,
      "kl": 0.0027463238802738488,
      "learning_rate": 9.812227883279295e-07,
      "loss": 0.0023,
      "num_tokens": 55712722.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 2028,
      "step_time": 27.899784050881863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 180.375,
      "completions/mean_terminated_length": 180.375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.31885142624378204,
      "epoch": 0.09397869383974063,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11050109565258026,
      "kl": 0.002334500750293955,
      "learning_rate": 9.812135247799907e-07,
      "loss": -0.0501,
      "num_tokens": 55736184.0,
      "reward": 0.6500093936920166,
      "reward_std": 0.32208919525146484,
      "rewards/reward_func/mean": 0.6500093936920166,
      "rewards/reward_func/std": 0.32208922505378723,
      "step": 2029,
      "step_time": 18.724287275224924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 187.125,
      "completions/mean_terminated_length": 187.125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3281235545873642,
      "epoch": 0.09402501157943492,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12246488779783249,
      "kl": 0.005524210748262703,
      "learning_rate": 9.812042612320518e-07,
      "loss": -0.0006,
      "num_tokens": 55757050.0,
      "reward": 0.8068915009498596,
      "reward_std": 0.21836881339550018,
      "rewards/reward_func/mean": 0.8068915009498596,
      "rewards/reward_func/std": 0.21836881339550018,
      "step": 2030,
      "step_time": 20.11217812821269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 202.875,
      "completions/mean_terminated_length": 202.875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.24049953371286392,
      "epoch": 0.09407132931912923,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015947173815220594,
      "kl": 0.0012396015226840973,
      "learning_rate": 9.81194997684113e-07,
      "loss": 0.0001,
      "num_tokens": 55783256.0,
      "reward": 0.9161604642868042,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9161604642868042,
      "rewards/reward_func/std": 0.0,
      "step": 2031,
      "step_time": 21.03316890448332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 198.6875,
      "completions/mean_terminated_length": 198.6875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.2781083658337593,
      "epoch": 0.09411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09110790491104126,
      "kl": 0.001261789002455771,
      "learning_rate": 9.81185734136174e-07,
      "loss": 0.1084,
      "num_tokens": 55824419.0,
      "reward": 0.7620233297348022,
      "reward_std": 0.2957080006599426,
      "rewards/reward_func/mean": 0.7620233297348022,
      "rewards/reward_func/std": 0.2957080006599426,
      "step": 2032,
      "step_time": 26.385674338787794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 191.3125,
      "completions/mean_terminated_length": 191.3125,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.20272637158632278,
      "epoch": 0.09416396479851784,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07322783023118973,
      "kl": 0.0009164645161945373,
      "learning_rate": 9.811764705882352e-07,
      "loss": 0.0114,
      "num_tokens": 55860968.0,
      "reward": 0.8565701246261597,
      "reward_std": 0.002278505591675639,
      "rewards/reward_func/mean": 0.8565701246261597,
      "rewards/reward_func/std": 0.0022785027977079153,
      "step": 2033,
      "step_time": 24.84770367667079
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 170.5625,
      "completions/mean_terminated_length": 170.5625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.39616280794143677,
      "epoch": 0.09421028253821213,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0078053586184978485,
      "kl": 0.002060381375486031,
      "learning_rate": 9.811672070402965e-07,
      "loss": 0.0001,
      "num_tokens": 55911825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2034,
      "step_time": 24.6925237365067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 262.0,
      "completions/max_terminated_length": 262.0,
      "completions/mean_length": 236.5,
      "completions/mean_terminated_length": 236.5,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "entropy": 0.3319804519414902,
      "epoch": 0.09425660027790644,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10468065738677979,
      "kl": 0.0035973970079794526,
      "learning_rate": 9.811579434923574e-07,
      "loss": -0.0125,
      "num_tokens": 55945785.0,
      "reward": 0.625,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5,
      "step": 2035,
      "step_time": 24.75483187288046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 301.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 203.0625,
      "completions/mean_terminated_length": 203.0625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.43262146413326263,
      "epoch": 0.09430291801760074,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0937405377626419,
      "kl": 0.0019505124655552208,
      "learning_rate": 9.811486799444185e-07,
      "loss": 0.1206,
      "num_tokens": 55968362.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 2036,
      "step_time": 25.007275737822056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 200.25,
      "completions/mean_terminated_length": 200.25,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.15093117579817772,
      "epoch": 0.09434923575729505,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08309942483901978,
      "kl": 0.0011663798068184406,
      "learning_rate": 9.811394163964799e-07,
      "loss": 0.0106,
      "num_tokens": 55999070.0,
      "reward": 0.9607253074645996,
      "reward_std": 0.02734741009771824,
      "rewards/reward_func/mean": 0.9607253074645996,
      "rewards/reward_func/std": 0.027347413823008537,
      "step": 2037,
      "step_time": 21.52350740507245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 160.375,
      "completions/mean_terminated_length": 160.375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.37679579854011536,
      "epoch": 0.09439555349698935,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013516127364709973,
      "kl": 0.0016926408861763775,
      "learning_rate": 9.81130152848541e-07,
      "loss": 0.0001,
      "num_tokens": 56034324.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2038,
      "step_time": 20.581428475677967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 152.6875,
      "completions/mean_terminated_length": 152.6875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.36912456154823303,
      "epoch": 0.09444187123668366,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032714083790779114,
      "kl": 0.0026178762200288475,
      "learning_rate": 9.811208893006021e-07,
      "loss": 0.0001,
      "num_tokens": 56079519.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2039,
      "step_time": 21.89334412664175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 185.375,
      "completions/mean_terminated_length": 185.375,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.3762866109609604,
      "epoch": 0.09448818897637795,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023587944451719522,
      "kl": 0.0018823669233825058,
      "learning_rate": 9.811116257526633e-07,
      "loss": 0.0001,
      "num_tokens": 56112933.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2040,
      "step_time": 21.799803376197815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 146.5625,
      "completions/mean_terminated_length": 146.5625,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.26305900514125824,
      "epoch": 0.09453450671607226,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00414580712094903,
      "kl": 0.0029761113109998405,
      "learning_rate": 9.811023622047244e-07,
      "loss": 0.0001,
      "num_tokens": 56132894.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2041,
      "step_time": 16.481983814388514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 123.9375,
      "completions/mean_terminated_length": 123.9375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2890935614705086,
      "epoch": 0.09458082445576656,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013513185549527407,
      "kl": 0.001297875598538667,
      "learning_rate": 9.810930986567855e-07,
      "loss": 0.0001,
      "num_tokens": 56152541.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2042,
      "step_time": 13.403200272470713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 135.1875,
      "completions/mean_terminated_length": 135.1875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3202582746744156,
      "epoch": 0.09462714219546087,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020696830470114946,
      "kl": 0.0016303090669680387,
      "learning_rate": 9.810838351088466e-07,
      "loss": 0.0001,
      "num_tokens": 56182896.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2043,
      "step_time": 17.72435461357236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 130.5,
      "completions/mean_terminated_length": 130.5,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.35065240412950516,
      "epoch": 0.09467345993515516,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010780496522784233,
      "kl": 0.0011586417676880956,
      "learning_rate": 9.810745715609078e-07,
      "loss": 0.0001,
      "num_tokens": 56206344.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2044,
      "step_time": 15.890535064041615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 166.0625,
      "completions/mean_terminated_length": 166.0625,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.3423704281449318,
      "epoch": 0.09471977767484947,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012414716184139252,
      "kl": 0.0013426708756014705,
      "learning_rate": 9.810653080129689e-07,
      "loss": 0.0001,
      "num_tokens": 56236041.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2045,
      "step_time": 19.825654160231352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 168.25,
      "completions/mean_terminated_length": 168.25,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.2755350135266781,
      "epoch": 0.09476609541454377,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035304257180541754,
      "kl": 0.0024147378862835467,
      "learning_rate": 9.8105604446503e-07,
      "loss": 0.0001,
      "num_tokens": 56257853.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2046,
      "step_time": 18.540200740098953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 127.25,
      "completions/mean_terminated_length": 127.25,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.34416310489177704,
      "epoch": 0.09481241315423808,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022126396652311087,
      "kl": 0.0016846568905748427,
      "learning_rate": 9.810467809170913e-07,
      "loss": 0.0001,
      "num_tokens": 56283969.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2047,
      "step_time": 17.07555378228426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 368.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 311.6875,
      "completions/mean_terminated_length": 311.6875,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "entropy": 0.22475888207554817,
      "epoch": 0.09485873089393237,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07333046942949295,
      "kl": 0.002175509522203356,
      "learning_rate": 9.810375173691523e-07,
      "loss": -0.0462,
      "num_tokens": 56324796.0,
      "reward": 0.8022526502609253,
      "reward_std": 0.1691083461046219,
      "rewards/reward_func/mean": 0.8022526502609253,
      "rewards/reward_func/std": 0.1691083461046219,
      "step": 2048,
      "step_time": 33.69538462534547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 141.8125,
      "completions/mean_terminated_length": 141.8125,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.37307263165712357,
      "epoch": 0.09490504863362668,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0072221108712255955,
      "kl": 0.004154925467446446,
      "learning_rate": 9.810282538212134e-07,
      "loss": 0.0002,
      "num_tokens": 56345609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2049,
      "step_time": 15.079968519508839
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 138.75,
      "completions/mean_terminated_length": 138.75,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.304686039686203,
      "epoch": 0.09495136637332098,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034532558638602495,
      "kl": 0.0017407748964615166,
      "learning_rate": 9.810189902732747e-07,
      "loss": 0.0001,
      "num_tokens": 56365893.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2050,
      "step_time": 14.859419286251068
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 125.6875,
      "completions/mean_terminated_length": 125.6875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2114924117922783,
      "epoch": 0.09499768411301529,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.02498413622379303,
      "kl": 0.004018052568426356,
      "learning_rate": 9.810097267253358e-07,
      "loss": 0.0002,
      "num_tokens": 56385392.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2051,
      "step_time": 13.88862270489335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 197.0625,
      "completions/mean_terminated_length": 197.0625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.4702746123075485,
      "epoch": 0.09504400185270959,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017258214065805078,
      "kl": 0.0020685320487245917,
      "learning_rate": 9.81000463177397e-07,
      "loss": 0.0001,
      "num_tokens": 56414881.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2052,
      "step_time": 23.93411250412464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 166.125,
      "completions/mean_terminated_length": 166.125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.35899273306131363,
      "epoch": 0.0950903195924039,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014317986788228154,
      "kl": 0.0013684245059266686,
      "learning_rate": 9.80991199629458e-07,
      "loss": 0.0001,
      "num_tokens": 56437347.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2053,
      "step_time": 17.814839605242014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 138.6875,
      "completions/mean_terminated_length": 138.6875,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.31955157220363617,
      "epoch": 0.09513663733209819,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013007436646148562,
      "kl": 0.0011292924464214593,
      "learning_rate": 9.809819360815192e-07,
      "loss": 0.0001,
      "num_tokens": 56464094.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2054,
      "step_time": 16.677923016250134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 157.75,
      "completions/mean_terminated_length": 157.75,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3834758996963501,
      "epoch": 0.0951829550717925,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012701147934421897,
      "kl": 0.0014297603629529476,
      "learning_rate": 9.809726725335803e-07,
      "loss": 0.0001,
      "num_tokens": 56499690.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2055,
      "step_time": 20.412053678184748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 149.375,
      "completions/mean_terminated_length": 149.375,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.3550326004624367,
      "epoch": 0.0952292728114868,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015697143971920013,
      "kl": 0.0015536470455117524,
      "learning_rate": 9.809634089856415e-07,
      "loss": 0.0001,
      "num_tokens": 56525216.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2056,
      "step_time": 18.424117360264063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 137.875,
      "completions/mean_terminated_length": 137.875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.30019643902778625,
      "epoch": 0.09527559055118111,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014886199496686459,
      "kl": 0.0015041444275993854,
      "learning_rate": 9.809541454377026e-07,
      "loss": 0.0001,
      "num_tokens": 56547150.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2057,
      "step_time": 16.6746284365654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 361.0,
      "completions/max_terminated_length": 361.0,
      "completions/mean_length": 257.9375,
      "completions/mean_terminated_length": 257.9375,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "entropy": 0.337487168610096,
      "epoch": 0.0953219082908754,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07086921483278275,
      "kl": 0.0030414532811846584,
      "learning_rate": 9.809448818897637e-07,
      "loss": -0.1511,
      "num_tokens": 56575741.0,
      "reward": 0.5,
      "reward_std": 0.5163977742195129,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5163977742195129,
      "step": 2058,
      "step_time": 30.399008128792048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 118.75,
      "completions/mean_terminated_length": 118.75,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.23380812257528305,
      "epoch": 0.09536822603056971,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008936121012084186,
      "kl": 0.0009265767293982208,
      "learning_rate": 9.809356183418248e-07,
      "loss": 0.0,
      "num_tokens": 56595721.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2059,
      "step_time": 13.686347719281912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 159.1875,
      "completions/mean_terminated_length": 159.1875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.25653551146388054,
      "epoch": 0.09541454377026401,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008221469819545746,
      "kl": 0.003456158301560208,
      "learning_rate": 9.80926354793886e-07,
      "loss": 0.0002,
      "num_tokens": 56619692.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 2060,
      "step_time": 17.168593287467957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 194.625,
      "completions/mean_terminated_length": 194.625,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.1649177223443985,
      "epoch": 0.09546086150995832,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010986204724758863,
      "kl": 0.001024660756229423,
      "learning_rate": 9.80917091245947e-07,
      "loss": 0.0001,
      "num_tokens": 56650406.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 2061,
      "step_time": 21.216629676520824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 300.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 223.5625,
      "completions/mean_terminated_length": 223.5625,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.30768416076898575,
      "epoch": 0.09550717924965262,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07383064180612564,
      "kl": 0.003608456638175994,
      "learning_rate": 9.809078276980082e-07,
      "loss": -0.0686,
      "num_tokens": 56672831.0,
      "reward": 0.3277292549610138,
      "reward_std": 0.38379722833633423,
      "rewards/reward_func/mean": 0.3277292549610138,
      "rewards/reward_func/std": 0.38379722833633423,
      "step": 2062,
      "step_time": 25.07394739612937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 236.1875,
      "completions/mean_terminated_length": 236.1875,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "entropy": 0.184597447514534,
      "epoch": 0.09555349698934693,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11104031652212143,
      "kl": 0.0008925462025217712,
      "learning_rate": 9.808985641500693e-07,
      "loss": 0.0104,
      "num_tokens": 56707810.0,
      "reward": 0.9944300055503845,
      "reward_std": 0.022280026227235794,
      "rewards/reward_func/mean": 0.9944300055503845,
      "rewards/reward_func/std": 0.022280022501945496,
      "step": 2063,
      "step_time": 26.623209707438946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 169.1875,
      "completions/mean_terminated_length": 169.1875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.14013603702187538,
      "epoch": 0.09559981472904122,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011042552068829536,
      "kl": 0.0006901939486851916,
      "learning_rate": 9.808893006021307e-07,
      "loss": 0.0,
      "num_tokens": 56745749.0,
      "reward": 0.8890097737312317,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8890097737312317,
      "rewards/reward_func/std": 0.0,
      "step": 2064,
      "step_time": 21.75394108146429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 131.125,
      "completions/mean_terminated_length": 131.125,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.35413843393325806,
      "epoch": 0.09564613246873553,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026031648740172386,
      "kl": 0.002009792427998036,
      "learning_rate": 9.808800370541918e-07,
      "loss": 0.0001,
      "num_tokens": 56766775.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2065,
      "step_time": 14.743171103298664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 134.25,
      "completions/mean_terminated_length": 134.25,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.29190149903297424,
      "epoch": 0.09569245020842983,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034215168561786413,
      "kl": 0.0023275664425455034,
      "learning_rate": 9.808707735062527e-07,
      "loss": 0.0001,
      "num_tokens": 56796123.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2066,
      "step_time": 16.022603794932365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 202.5,
      "completions/mean_terminated_length": 202.5,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.4320342019200325,
      "epoch": 0.09573876794812414,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002367663662880659,
      "kl": 0.0018327873258385807,
      "learning_rate": 9.80861509958314e-07,
      "loss": 0.0001,
      "num_tokens": 56823603.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2067,
      "step_time": 22.358512055128813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 139.1875,
      "completions/mean_terminated_length": 139.1875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.2488016001880169,
      "epoch": 0.09578508568781843,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17417486011981964,
      "kl": 0.004339932158472948,
      "learning_rate": 9.808522464103752e-07,
      "loss": -0.0446,
      "num_tokens": 56850166.0,
      "reward": 0.9193795919418335,
      "reward_std": 0.15853877365589142,
      "rewards/reward_func/mean": 0.9193795919418335,
      "rewards/reward_func/std": 0.15853877365589142,
      "step": 2068,
      "step_time": 16.58432499691844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 180.8125,
      "completions/mean_terminated_length": 180.8125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.23132160678505898,
      "epoch": 0.09583140342751274,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10905469954013824,
      "kl": 0.001088093180442229,
      "learning_rate": 9.808429828624363e-07,
      "loss": 0.0067,
      "num_tokens": 56889859.0,
      "reward": 0.8267015218734741,
      "reward_std": 0.021780284121632576,
      "rewards/reward_func/mean": 0.8267015218734741,
      "rewards/reward_func/std": 0.02178027853369713,
      "step": 2069,
      "step_time": 23.202710587531328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 267.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 220.3125,
      "completions/mean_terminated_length": 220.3125,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.2583886757493019,
      "epoch": 0.09587772116720704,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002973136492073536,
      "kl": 0.0019069083500653505,
      "learning_rate": 9.808337193144974e-07,
      "loss": 0.0001,
      "num_tokens": 56915464.0,
      "reward": 0.7105904221534729,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7105904221534729,
      "rewards/reward_func/std": 0.0,
      "step": 2070,
      "step_time": 23.26581984013319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 190.3125,
      "completions/mean_terminated_length": 190.3125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.40494538098573685,
      "epoch": 0.09592403890690135,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037858253344893456,
      "kl": 0.002681231824681163,
      "learning_rate": 9.808244557665585e-07,
      "loss": 0.0001,
      "num_tokens": 56940269.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2071,
      "step_time": 21.089487422257662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 130.0,
      "completions/mean_terminated_length": 130.0,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.21866579353809357,
      "epoch": 0.09597035664659564,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031161820515990257,
      "kl": 0.001894166081910953,
      "learning_rate": 9.808151922186197e-07,
      "loss": 0.0001,
      "num_tokens": 56959869.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2072,
      "step_time": 13.873397447168827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 125.25,
      "completions/mean_terminated_length": 125.25,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.22946714982390404,
      "epoch": 0.09601667438628995,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005955255590379238,
      "kl": 0.0018666211108211428,
      "learning_rate": 9.808059286706808e-07,
      "loss": 0.0001,
      "num_tokens": 56979249.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2073,
      "step_time": 13.370041579008102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 163.8125,
      "completions/mean_terminated_length": 163.8125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.3609740734100342,
      "epoch": 0.09606299212598425,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018750651506707072,
      "kl": 0.001974839164176956,
      "learning_rate": 9.80796665122742e-07,
      "loss": 0.0001,
      "num_tokens": 57000014.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2074,
      "step_time": 17.314895667135715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 137.25,
      "completions/mean_terminated_length": 137.25,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.341608501970768,
      "epoch": 0.09610930986567856,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021006171591579914,
      "kl": 0.0016438520688097924,
      "learning_rate": 9.80787401574803e-07,
      "loss": 0.0001,
      "num_tokens": 57023938.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2075,
      "step_time": 15.84898490831256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 138.4375,
      "completions/mean_terminated_length": 138.4375,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.20013650879263878,
      "epoch": 0.09615562760537286,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005079299211502075,
      "kl": 0.002640404738485813,
      "learning_rate": 9.807781380268642e-07,
      "loss": 0.0001,
      "num_tokens": 57043641.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2076,
      "step_time": 14.947165336459875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 148.875,
      "completions/mean_terminated_length": 148.875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.21334562450647354,
      "epoch": 0.09620194534506717,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010591655736789107,
      "kl": 0.0010537790512898937,
      "learning_rate": 9.807688744789255e-07,
      "loss": 0.0001,
      "num_tokens": 57072295.0,
      "reward": 0.7425271272659302,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7425271272659302,
      "rewards/reward_func/std": 0.0,
      "step": 2077,
      "step_time": 16.897052317857742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 136.5,
      "completions/mean_terminated_length": 136.5,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3212684392929077,
      "epoch": 0.09624826308476146,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001615766086615622,
      "kl": 0.0015198698383755982,
      "learning_rate": 9.807596109309864e-07,
      "loss": 0.0001,
      "num_tokens": 57097791.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2078,
      "step_time": 16.013172037899494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 170.625,
      "completions/mean_terminated_length": 170.625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.16729550063610077,
      "epoch": 0.09629458082445577,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001133670681156218,
      "kl": 0.0008856083150021732,
      "learning_rate": 9.807503473830475e-07,
      "loss": 0.0,
      "num_tokens": 57121849.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 2079,
      "step_time": 17.570480413734913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 145.375,
      "completions/mean_terminated_length": 145.375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.1947532631456852,
      "epoch": 0.09634089856415007,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004643620457500219,
      "kl": 0.002088828943669796,
      "learning_rate": 9.807410838351089e-07,
      "loss": 0.0001,
      "num_tokens": 57147743.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 2080,
      "step_time": 16.442312948405743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 139.9375,
      "completions/mean_terminated_length": 139.9375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3246047720313072,
      "epoch": 0.09638721630384438,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006020332686603069,
      "kl": 0.003244896070100367,
      "learning_rate": 9.8073182028717e-07,
      "loss": 0.0002,
      "num_tokens": 57169326.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2081,
      "step_time": 16.652198139578104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 198.4375,
      "completions/mean_terminated_length": 198.4375,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.22148482128977776,
      "epoch": 0.09643353404353867,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012353778583928943,
      "kl": 0.001151223696069792,
      "learning_rate": 9.807225567392311e-07,
      "loss": 0.0001,
      "num_tokens": 57205141.0,
      "reward": 0.5623413324356079,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5623413324356079,
      "rewards/reward_func/std": 0.0,
      "step": 2082,
      "step_time": 22.177756395190954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 181.625,
      "completions/mean_terminated_length": 181.625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.21318692713975906,
      "epoch": 0.09647985178323298,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11407241225242615,
      "kl": 0.0032745692878961563,
      "learning_rate": 9.807132931912923e-07,
      "loss": -0.0443,
      "num_tokens": 57230063.0,
      "reward": 0.7006678581237793,
      "reward_std": 0.07982190698385239,
      "rewards/reward_func/mean": 0.7006678581237793,
      "rewards/reward_func/std": 0.07982189953327179,
      "step": 2083,
      "step_time": 20.12787677720189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 123.125,
      "completions/mean_terminated_length": 123.125,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2635095790028572,
      "epoch": 0.09652616952292728,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025622595567256212,
      "kl": 0.0018459637358319014,
      "learning_rate": 9.807040296433534e-07,
      "loss": 0.0001,
      "num_tokens": 57252145.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2084,
      "step_time": 13.830753143876791
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 158.1875,
      "completions/mean_terminated_length": 158.1875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3883274719119072,
      "epoch": 0.09657248726262159,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00164109468460083,
      "kl": 0.001676585408858955,
      "learning_rate": 9.806947660954145e-07,
      "loss": 0.0001,
      "num_tokens": 57285220.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2085,
      "step_time": 19.91367145255208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 190.1875,
      "completions/mean_terminated_length": 190.1875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.4454917311668396,
      "epoch": 0.09661880500231589,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00665110070258379,
      "kl": 0.004151255008764565,
      "learning_rate": 9.806855025474756e-07,
      "loss": 0.0002,
      "num_tokens": 57313207.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2086,
      "step_time": 24.15814983472228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 190.25,
      "completions/mean_terminated_length": 190.25,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.24852854758501053,
      "epoch": 0.0966651227420102,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06248871237039566,
      "kl": 0.002544258371926844,
      "learning_rate": 9.806762389995368e-07,
      "loss": 0.0221,
      "num_tokens": 57342507.0,
      "reward": 0.9785215258598328,
      "reward_std": 0.08591390401124954,
      "rewards/reward_func/mean": 0.9785215258598328,
      "rewards/reward_func/std": 0.08591391146183014,
      "step": 2087,
      "step_time": 20.447678916156292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 178.5,
      "completions/mean_terminated_length": 178.5,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.19578471034765244,
      "epoch": 0.09671144048170449,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08643905818462372,
      "kl": 0.0011518751416588202,
      "learning_rate": 9.806669754515979e-07,
      "loss": -0.0566,
      "num_tokens": 57372995.0,
      "reward": 0.5361264944076538,
      "reward_std": 0.09083743393421173,
      "rewards/reward_func/mean": 0.5361264944076538,
      "rewards/reward_func/std": 0.09083743393421173,
      "step": 2088,
      "step_time": 22.06103541329503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 166.5625,
      "completions/mean_terminated_length": 166.5625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.20414895564317703,
      "epoch": 0.0967577582213988,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13867773115634918,
      "kl": 0.0043305178405717015,
      "learning_rate": 9.80657711903659e-07,
      "loss": 0.0415,
      "num_tokens": 57397628.0,
      "reward": 0.9576616287231445,
      "reward_std": 0.049273598939180374,
      "rewards/reward_func/mean": 0.9576616287231445,
      "rewards/reward_func/std": 0.049273598939180374,
      "step": 2089,
      "step_time": 19.180107697844505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 262.0,
      "completions/max_terminated_length": 262.0,
      "completions/mean_length": 214.125,
      "completions/mean_terminated_length": 214.125,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.4542301818728447,
      "epoch": 0.0968040759610931,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09310434758663177,
      "kl": 0.0026618808042258024,
      "learning_rate": 9.806484483557203e-07,
      "loss": 0.0406,
      "num_tokens": 57430206.0,
      "reward": 0.625,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5,
      "step": 2090,
      "step_time": 25.198974158614874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 133.1875,
      "completions/mean_terminated_length": 133.1875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2857926934957504,
      "epoch": 0.09685039370078741,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002290284726768732,
      "kl": 0.0015625466767232865,
      "learning_rate": 9.806391848077813e-07,
      "loss": 0.0001,
      "num_tokens": 57465521.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2091,
      "step_time": 18.497051488608122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 165.75,
      "completions/mean_terminated_length": 165.75,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2870178297162056,
      "epoch": 0.0968967114404817,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0984499603509903,
      "kl": 0.001998687395825982,
      "learning_rate": 9.806299212598424e-07,
      "loss": -0.0126,
      "num_tokens": 57486301.0,
      "reward": 0.8423806428909302,
      "reward_std": 0.09095054119825363,
      "rewards/reward_func/mean": 0.8423806428909302,
      "rewards/reward_func/std": 0.09095054864883423,
      "step": 2092,
      "step_time": 16.793691530823708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 185.0625,
      "completions/mean_terminated_length": 185.0625,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.16687441617250443,
      "epoch": 0.09694302918017601,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001002144650556147,
      "kl": 0.000947712775086984,
      "learning_rate": 9.806206577119035e-07,
      "loss": 0.0,
      "num_tokens": 57508446.0,
      "reward": 0.9091564416885376,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9091564416885376,
      "rewards/reward_func/std": 0.0,
      "step": 2093,
      "step_time": 18.088635966181755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 166.9375,
      "completions/mean_terminated_length": 166.9375,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.338656447827816,
      "epoch": 0.09698934691987031,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003697863081470132,
      "kl": 0.0021888002811465412,
      "learning_rate": 9.806113941639648e-07,
      "loss": 0.0001,
      "num_tokens": 57530093.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2094,
      "step_time": 18.40073474869132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 158.125,
      "completions/mean_terminated_length": 158.125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.373967744410038,
      "epoch": 0.09703566465956462,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010357302380725741,
      "kl": 0.001299672672757879,
      "learning_rate": 9.80602130616026e-07,
      "loss": 0.0001,
      "num_tokens": 57567359.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2095,
      "step_time": 19.8071150444448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 157.6875,
      "completions/mean_terminated_length": 157.6875,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.37355881184339523,
      "epoch": 0.09708198239925891,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012508081272244453,
      "kl": 0.0016345091571565717,
      "learning_rate": 9.80592867068087e-07,
      "loss": 0.0001,
      "num_tokens": 57601226.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2096,
      "step_time": 19.480534825474024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 173.4375,
      "completions/mean_terminated_length": 173.4375,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.21006585657596588,
      "epoch": 0.09712830013895322,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028879523742944,
      "kl": 0.0015600161132169887,
      "learning_rate": 9.805836035201482e-07,
      "loss": 0.0001,
      "num_tokens": 57622353.0,
      "reward": 0.39511775970458984,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.39511775970458984,
      "rewards/reward_func/std": 0.0,
      "step": 2097,
      "step_time": 17.613510336726904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 195.6875,
      "completions/mean_terminated_length": 195.6875,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.4354572370648384,
      "epoch": 0.09717461787864752,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025400533340871334,
      "kl": 0.0021850342745892704,
      "learning_rate": 9.805743399722093e-07,
      "loss": 0.0001,
      "num_tokens": 57645788.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2098,
      "step_time": 20.207767341285944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 180.8125,
      "completions/mean_terminated_length": 180.8125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.21231921389698982,
      "epoch": 0.09722093561834183,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016797012649476528,
      "kl": 0.0012572874256875366,
      "learning_rate": 9.805650764242705e-07,
      "loss": 0.0001,
      "num_tokens": 57675609.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2099,
      "step_time": 19.69765767455101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 213.625,
      "completions/mean_terminated_length": 213.625,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.48515453189611435,
      "epoch": 0.09726725335803613,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002514895284548402,
      "kl": 0.002511514292564243,
      "learning_rate": 9.805558128763316e-07,
      "loss": 0.0001,
      "num_tokens": 57704979.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2100,
      "step_time": 23.956448070704937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 180.75,
      "completions/mean_terminated_length": 180.75,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.2419266775250435,
      "epoch": 0.09731357109773044,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002808332908898592,
      "kl": 0.0017441484960727394,
      "learning_rate": 9.805465493283927e-07,
      "loss": 0.0001,
      "num_tokens": 57727503.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 2101,
      "step_time": 18.251241214573383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 131.0,
      "completions/mean_length": 115.875,
      "completions/mean_terminated_length": 115.875,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2840619161725044,
      "epoch": 0.09735988883742473,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017955612856894732,
      "kl": 0.0016842714103404433,
      "learning_rate": 9.805372857804538e-07,
      "loss": 0.0001,
      "num_tokens": 57750957.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2102,
      "step_time": 13.67229737713933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 138.25,
      "completions/mean_terminated_length": 138.25,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.32540467381477356,
      "epoch": 0.09740620657711904,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001574921770952642,
      "kl": 0.0017012866446748376,
      "learning_rate": 9.80528022232515e-07,
      "loss": 0.0001,
      "num_tokens": 57787057.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2103,
      "step_time": 18.993708673864603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 140.6875,
      "completions/mean_terminated_length": 140.6875,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2592414878308773,
      "epoch": 0.09745252431681334,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021899163257330656,
      "kl": 0.0015125124482437968,
      "learning_rate": 9.80518758684576e-07,
      "loss": 0.0001,
      "num_tokens": 57809964.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2104,
      "step_time": 15.311320420354605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 141.4375,
      "completions/mean_terminated_length": 141.4375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.23829853907227516,
      "epoch": 0.09749884205650765,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032185029704123735,
      "kl": 0.0018298894865438342,
      "learning_rate": 9.805094951366372e-07,
      "loss": 0.0001,
      "num_tokens": 57829875.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2105,
      "step_time": 15.38297138735652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 122.75,
      "completions/mean_terminated_length": 122.75,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2016875445842743,
      "epoch": 0.09754515979620194,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001584141282364726,
      "kl": 0.0011248220544075593,
      "learning_rate": 9.805002315886983e-07,
      "loss": 0.0001,
      "num_tokens": 57849327.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2106,
      "step_time": 13.799412783235312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 333.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 231.5625,
      "completions/mean_terminated_length": 231.5625,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.37778548151254654,
      "epoch": 0.09759147753589625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09862415492534637,
      "kl": 0.002680773555766791,
      "learning_rate": 9.804909680407597e-07,
      "loss": -0.0603,
      "num_tokens": 57884040.0,
      "reward": 0.0003452743694651872,
      "reward_std": 0.00024041820142883807,
      "rewards/reward_func/mean": 0.0003452743694651872,
      "rewards/reward_func/std": 0.0002404182159807533,
      "step": 2107,
      "step_time": 30.234281316399574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 112.5,
      "completions/mean_terminated_length": 112.5,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.2959787994623184,
      "epoch": 0.09763779527559055,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025102982763201,
      "kl": 0.0020582875586114824,
      "learning_rate": 9.804817044928208e-07,
      "loss": 0.0001,
      "num_tokens": 57907264.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2108,
      "step_time": 13.531257309019566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 154.6875,
      "completions/mean_terminated_length": 154.6875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.33832044154405594,
      "epoch": 0.09768411301528486,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001770894741639495,
      "kl": 0.001571035390952602,
      "learning_rate": 9.804724409448817e-07,
      "loss": 0.0001,
      "num_tokens": 57933659.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2109,
      "step_time": 17.286038760095835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 176.5,
      "completions/mean_terminated_length": 176.5,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.2194627821445465,
      "epoch": 0.09773043075497916,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010604043491184711,
      "kl": 0.0009090398234548047,
      "learning_rate": 9.80463177396943e-07,
      "loss": 0.0,
      "num_tokens": 57986179.0,
      "reward": 0.17782793939113617,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.17782793939113617,
      "rewards/reward_func/std": 0.0,
      "step": 2110,
      "step_time": 26.853449895977974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 118.9375,
      "completions/mean_terminated_length": 118.9375,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.3218560069799423,
      "epoch": 0.09777674849467347,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017308102687820792,
      "kl": 0.0015206353273242712,
      "learning_rate": 9.804539138490042e-07,
      "loss": 0.0001,
      "num_tokens": 58010162.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2111,
      "step_time": 15.71082091704011
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 127.25,
      "completions/mean_terminated_length": 127.25,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.36409657448530197,
      "epoch": 0.09782306623436776,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018410587217658758,
      "kl": 0.001968503464013338,
      "learning_rate": 9.804446503010653e-07,
      "loss": 0.0001,
      "num_tokens": 58038038.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2112,
      "step_time": 15.209703668951988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 150.3125,
      "completions/mean_terminated_length": 150.3125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.20162580534815788,
      "epoch": 0.09786938397406207,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001409210846759379,
      "kl": 0.0011586171895032749,
      "learning_rate": 9.804353867531264e-07,
      "loss": 0.0001,
      "num_tokens": 58063291.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 2113,
      "step_time": 17.23585532233119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 138.5,
      "completions/mean_terminated_length": 138.5,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.31495241820812225,
      "epoch": 0.09791570171375637,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013494627783074975,
      "kl": 0.001211692811921239,
      "learning_rate": 9.804261232051876e-07,
      "loss": 0.0001,
      "num_tokens": 58092227.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2114,
      "step_time": 16.41570270061493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 154.5625,
      "completions/mean_terminated_length": 154.5625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.375872403383255,
      "epoch": 0.09796201945345068,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012647317489609122,
      "kl": 0.0015600319602526724,
      "learning_rate": 9.804168596572487e-07,
      "loss": 0.0001,
      "num_tokens": 58128988.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2115,
      "step_time": 20.176334507763386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 334.0,
      "completions/max_terminated_length": 334.0,
      "completions/mean_length": 239.8125,
      "completions/mean_terminated_length": 239.8125,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.2824728675186634,
      "epoch": 0.09800833719314497,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08006158471107483,
      "kl": 0.002352870360482484,
      "learning_rate": 9.804075961093098e-07,
      "loss": -0.0952,
      "num_tokens": 58163705.0,
      "reward": 0.625,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5,
      "step": 2116,
      "step_time": 30.311158139258623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 194.125,
      "completions/mean_terminated_length": 194.125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.4082041159272194,
      "epoch": 0.09805465493283928,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004847372882068157,
      "kl": 0.003344768425449729,
      "learning_rate": 9.80398332561371e-07,
      "loss": 0.0002,
      "num_tokens": 58185563.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2117,
      "step_time": 18.672387160360813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 152.875,
      "completions/mean_terminated_length": 152.875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.4239012971520424,
      "epoch": 0.09810097267253358,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015941817546263337,
      "kl": 0.0015966749342624098,
      "learning_rate": 9.80389069013432e-07,
      "loss": 0.0001,
      "num_tokens": 58207097.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2118,
      "step_time": 17.626061864197254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 190.0,
      "completions/mean_terminated_length": 190.0,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.26102016121149063,
      "epoch": 0.09814729041222789,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017980575794354081,
      "kl": 0.0013494390004780143,
      "learning_rate": 9.803798054654932e-07,
      "loss": 0.0001,
      "num_tokens": 58229593.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2119,
      "step_time": 18.867047514766455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 112.875,
      "completions/mean_terminated_length": 112.875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.3516552895307541,
      "epoch": 0.09819360815192218,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025518988259136677,
      "kl": 0.0019067800021730363,
      "learning_rate": 9.803705419175545e-07,
      "loss": 0.0001,
      "num_tokens": 58255303.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2120,
      "step_time": 14.232797224074602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 118.25,
      "completions/mean_terminated_length": 118.25,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.23819386214017868,
      "epoch": 0.0982399258916165,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013898770557716489,
      "kl": 0.0013417064037639648,
      "learning_rate": 9.803612783696154e-07,
      "loss": 0.0001,
      "num_tokens": 58274619.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2121,
      "step_time": 13.692626401782036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 175.1875,
      "completions/mean_terminated_length": 175.1875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.3593531921505928,
      "epoch": 0.09828624363131079,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12910568714141846,
      "kl": 0.0021692859299946576,
      "learning_rate": 9.803520148216766e-07,
      "loss": 0.0109,
      "num_tokens": 58296686.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 2122,
      "step_time": 19.37053521350026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 176.0625,
      "completions/mean_terminated_length": 176.0625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.38618531078100204,
      "epoch": 0.0983325613710051,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002144093392416835,
      "kl": 0.002112426533130929,
      "learning_rate": 9.803427512737377e-07,
      "loss": 0.0001,
      "num_tokens": 58318975.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2123,
      "step_time": 20.69658763706684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 175.75,
      "completions/mean_terminated_length": 175.75,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3645559474825859,
      "epoch": 0.0983788791106994,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1052614226937294,
      "kl": 0.002696045790798962,
      "learning_rate": 9.80333487725799e-07,
      "loss": -0.0076,
      "num_tokens": 58344139.0,
      "reward": 0.1696605384349823,
      "reward_std": 0.36474987864494324,
      "rewards/reward_func/mean": 0.1696605384349823,
      "rewards/reward_func/std": 0.36474987864494324,
      "step": 2124,
      "step_time": 18.370091810822487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 257.0,
      "completions/max_terminated_length": 257.0,
      "completions/mean_length": 205.5,
      "completions/mean_terminated_length": 205.5,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.35013314336538315,
      "epoch": 0.0984251968503937,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16796258091926575,
      "kl": 0.0027245827950537205,
      "learning_rate": 9.803242241778601e-07,
      "loss": -0.0635,
      "num_tokens": 58377411.0,
      "reward": 0.2525527775287628,
      "reward_std": 0.28331121802330017,
      "rewards/reward_func/mean": 0.2525527775287628,
      "rewards/reward_func/std": 0.28331121802330017,
      "step": 2125,
      "step_time": 24.360945247113705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 167.4375,
      "completions/mean_terminated_length": 167.4375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.39234574884176254,
      "epoch": 0.098471514590088,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012811715714633465,
      "kl": 0.0018412568606436253,
      "learning_rate": 9.803149606299213e-07,
      "loss": 0.0001,
      "num_tokens": 58427242.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2126,
      "step_time": 24.655378818511963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 217.25,
      "completions/mean_terminated_length": 217.25,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.3896579220890999,
      "epoch": 0.09851783232978231,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003342331387102604,
      "kl": 0.002888473973143846,
      "learning_rate": 9.803056970819824e-07,
      "loss": 0.0001,
      "num_tokens": 58465166.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2127,
      "step_time": 24.926264192909002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 176.0625,
      "completions/mean_terminated_length": 176.0625,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.36444825679063797,
      "epoch": 0.09856415006947661,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015871605137363076,
      "kl": 0.001471631578169763,
      "learning_rate": 9.802964335340435e-07,
      "loss": 0.0001,
      "num_tokens": 58491663.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2128,
      "step_time": 18.645525492727757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 151.8125,
      "completions/mean_terminated_length": 151.8125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.40546315163373947,
      "epoch": 0.09861046780917092,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014846234116703272,
      "kl": 0.0017653919057920575,
      "learning_rate": 9.802871699861046e-07,
      "loss": 0.0001,
      "num_tokens": 58532572.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2129,
      "step_time": 20.880179658532143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 119.5,
      "completions/mean_terminated_length": 119.5,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.2928945794701576,
      "epoch": 0.09865678554886521,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001732602366246283,
      "kl": 0.001616789581021294,
      "learning_rate": 9.802779064381658e-07,
      "loss": 0.0001,
      "num_tokens": 58553332.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2130,
      "step_time": 14.878775801509619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 197.0,
      "completions/mean_terminated_length": 197.0,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.3602956533432007,
      "epoch": 0.09870310328855952,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027457152027636766,
      "kl": 0.0020547359599731863,
      "learning_rate": 9.802686428902269e-07,
      "loss": 0.0001,
      "num_tokens": 58575828.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2131,
      "step_time": 20.551716335117817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 160.8125,
      "completions/mean_terminated_length": 160.8125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.34010016918182373,
      "epoch": 0.09874942102825382,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005504888948053122,
      "kl": 0.004122120910324156,
      "learning_rate": 9.80259379342288e-07,
      "loss": 0.0002,
      "num_tokens": 58598737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2132,
      "step_time": 20.68515168502927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 149.0,
      "completions/mean_terminated_length": 149.0,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3857610821723938,
      "epoch": 0.09879573876794813,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002364929998293519,
      "kl": 0.002093592134770006,
      "learning_rate": 9.802501157943491e-07,
      "loss": 0.0001,
      "num_tokens": 58622017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2133,
      "step_time": 16.79001769796014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 386.0,
      "completions/max_terminated_length": 386.0,
      "completions/mean_length": 286.625,
      "completions/mean_terminated_length": 286.625,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.3068154603242874,
      "epoch": 0.09884205650764243,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07661484181880951,
      "kl": 0.0024499078281223774,
      "learning_rate": 9.802408522464103e-07,
      "loss": -0.1667,
      "num_tokens": 58648043.0,
      "reward": 0.554679274559021,
      "reward_std": 0.4505092203617096,
      "rewards/reward_func/mean": 0.554679274559021,
      "rewards/reward_func/std": 0.4505092203617096,
      "step": 2134,
      "step_time": 31.21722112223506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 129.875,
      "completions/mean_terminated_length": 129.875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.28052544593811035,
      "epoch": 0.09888837424733674,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020063433330506086,
      "kl": 0.0015915047551970929,
      "learning_rate": 9.802315886984714e-07,
      "loss": 0.0001,
      "num_tokens": 58669193.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2135,
      "step_time": 13.941501632332802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 167.125,
      "completions/mean_terminated_length": 167.125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.39567285031080246,
      "epoch": 0.09893469198703103,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001788937021046877,
      "kl": 0.001613630069186911,
      "learning_rate": 9.802223251505325e-07,
      "loss": 0.0001,
      "num_tokens": 58705195.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2136,
      "step_time": 20.20920692011714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 130.3125,
      "completions/mean_terminated_length": 130.3125,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.23943090438842773,
      "epoch": 0.09898100972672534,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002178144408389926,
      "kl": 0.0015545105270575732,
      "learning_rate": 9.802130616025939e-07,
      "loss": 0.0001,
      "num_tokens": 58724768.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2137,
      "step_time": 15.008190114051104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 160.4375,
      "completions/mean_terminated_length": 160.4375,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.26413238793611526,
      "epoch": 0.09902732746641964,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002063291845843196,
      "kl": 0.0013851181138306856,
      "learning_rate": 9.80203798054655e-07,
      "loss": 0.0001,
      "num_tokens": 58747271.0,
      "reward": 0.43171051144599915,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.43171051144599915,
      "rewards/reward_func/std": 0.0,
      "step": 2138,
      "step_time": 16.399781592190266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.230569027364254,
      "epoch": 0.09907364520611395,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13090035319328308,
      "kl": 0.002730335865635425,
      "learning_rate": 9.80194534506716e-07,
      "loss": 0.0891,
      "num_tokens": 58768757.0,
      "reward": 0.4019976258277893,
      "reward_std": 0.1569238007068634,
      "rewards/reward_func/mean": 0.4019976258277893,
      "rewards/reward_func/std": 0.1569238007068634,
      "step": 2139,
      "step_time": 19.543237898498774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 143.9375,
      "completions/mean_terminated_length": 143.9375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3291706144809723,
      "epoch": 0.09911996294580824,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019901259802281857,
      "kl": 0.0017054046329576522,
      "learning_rate": 9.801852709587772e-07,
      "loss": 0.0001,
      "num_tokens": 58789108.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2140,
      "step_time": 14.700423073023558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 313.0,
      "completions/max_terminated_length": 313.0,
      "completions/mean_length": 207.75,
      "completions/mean_terminated_length": 207.75,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.41266487538814545,
      "epoch": 0.09916628068550255,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006549373734742403,
      "kl": 0.0029919970547780395,
      "learning_rate": 9.801760074108383e-07,
      "loss": 0.0002,
      "num_tokens": 58813376.0,
      "reward": 4.5816336835535765e-11,
      "reward_std": 1.2519406344946304e-10,
      "rewards/reward_func/mean": 4.5816336835535765e-11,
      "rewards/reward_func/std": 1.2519406344946304e-10,
      "step": 2141,
      "step_time": 26.165336951613426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 168.5625,
      "completions/mean_terminated_length": 168.5625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.17884428054094315,
      "epoch": 0.09921259842519685,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014924416318535805,
      "kl": 0.0013298338162712753,
      "learning_rate": 9.801667438628995e-07,
      "loss": 0.0001,
      "num_tokens": 58850441.0,
      "reward": 0.7093939781188965,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7093939781188965,
      "rewards/reward_func/std": 0.0,
      "step": 2142,
      "step_time": 20.494751326739788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 137.4375,
      "completions/mean_terminated_length": 137.4375,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.32688628137111664,
      "epoch": 0.09925891616489116,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018485600594431162,
      "kl": 0.0015059587894938886,
      "learning_rate": 9.801574803149606e-07,
      "loss": 0.0001,
      "num_tokens": 58873120.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2143,
      "step_time": 16.893532820045948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 132.5,
      "completions/mean_terminated_length": 132.5,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2655008025467396,
      "epoch": 0.09930523390458545,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015479301800951362,
      "kl": 0.0011959699622821063,
      "learning_rate": 9.801482167670217e-07,
      "loss": 0.0001,
      "num_tokens": 58895320.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2144,
      "step_time": 14.535428643226624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 201.8125,
      "completions/mean_terminated_length": 201.8125,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.419305719435215,
      "epoch": 0.09935155164427976,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10994252562522888,
      "kl": 0.0047153522609733045,
      "learning_rate": 9.801389532190828e-07,
      "loss": -0.0247,
      "num_tokens": 58920725.0,
      "reward": 0.4375,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.4375,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 2145,
      "step_time": 21.881717685610056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 141.4375,
      "completions/mean_terminated_length": 141.4375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.21117480844259262,
      "epoch": 0.09939786938397406,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002725240308791399,
      "kl": 0.0016288287588395178,
      "learning_rate": 9.80129689671144e-07,
      "loss": 0.0001,
      "num_tokens": 58943564.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2146,
      "step_time": 16.320057708770037
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 123.5625,
      "completions/mean_terminated_length": 123.5625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.29132601618766785,
      "epoch": 0.09944418712366837,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0051228683441877365,
      "kl": 0.001822701218770817,
      "learning_rate": 9.80120426123205e-07,
      "loss": 0.0001,
      "num_tokens": 58969445.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2147,
      "step_time": 15.702005561441183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 145.6875,
      "completions/mean_terminated_length": 145.6875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.3887370899319649,
      "epoch": 0.09949050486336267,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014260419411584735,
      "kl": 0.0014701895706821233,
      "learning_rate": 9.801111625752662e-07,
      "loss": 0.0001,
      "num_tokens": 59001024.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2148,
      "step_time": 17.672618754208088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 200.4375,
      "completions/mean_terminated_length": 200.4375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.21593355014920235,
      "epoch": 0.09953682260305698,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1311170905828476,
      "kl": 0.006177579518407583,
      "learning_rate": 9.801018990273273e-07,
      "loss": -0.1122,
      "num_tokens": 59028871.0,
      "reward": 0.7609108090400696,
      "reward_std": 0.24304427206516266,
      "rewards/reward_func/mean": 0.7609108090400696,
      "rewards/reward_func/std": 0.24304430186748505,
      "step": 2149,
      "step_time": 22.516092840582132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 125.625,
      "completions/mean_terminated_length": 125.625,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3041015565395355,
      "epoch": 0.09958314034275127,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008619067259132862,
      "kl": 0.0031587775447405875,
      "learning_rate": 9.800926354793887e-07,
      "loss": 0.0002,
      "num_tokens": 59050113.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2150,
      "step_time": 14.643218837678432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 178.4375,
      "completions/mean_terminated_length": 178.4375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.22522944957017899,
      "epoch": 0.09962945808244558,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015865567838773131,
      "kl": 0.0014014854386914521,
      "learning_rate": 9.800833719314498e-07,
      "loss": 0.0001,
      "num_tokens": 59072808.0,
      "reward": 0.4274149239063263,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.4274149239063263,
      "rewards/reward_func/std": 0.0,
      "step": 2151,
      "step_time": 17.842246294021606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 194.875,
      "completions/mean_terminated_length": 194.875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3555251806974411,
      "epoch": 0.09967577582213988,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021308145951479673,
      "kl": 0.0017405890685040504,
      "learning_rate": 9.800741083835107e-07,
      "loss": 0.0001,
      "num_tokens": 59112998.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2152,
      "step_time": 24.436119481921196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 145.0625,
      "completions/mean_terminated_length": 145.0625,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.4170515313744545,
      "epoch": 0.09972209356183419,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023248870857059956,
      "kl": 0.0018290946027264,
      "learning_rate": 9.800648448355718e-07,
      "loss": 0.0001,
      "num_tokens": 59146151.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2153,
      "step_time": 18.82840597257018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 135.9375,
      "completions/mean_terminated_length": 135.9375,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.15384923294186592,
      "epoch": 0.09976841130152848,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012263598619028926,
      "kl": 0.0007851967820897698,
      "learning_rate": 9.800555812876332e-07,
      "loss": 0.0,
      "num_tokens": 59180150.0,
      "reward": 0.3678794503211975,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3678794503211975,
      "rewards/reward_func/std": 0.0,
      "step": 2154,
      "step_time": 17.61545640602708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 177.75,
      "completions/mean_terminated_length": 177.75,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.33582911640405655,
      "epoch": 0.0998147290412228,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4987824261188507,
      "kl": 0.021140100667253137,
      "learning_rate": 9.800463177396943e-07,
      "loss": 0.0232,
      "num_tokens": 59203602.0,
      "reward": 0.5846918821334839,
      "reward_std": 0.46775349974632263,
      "rewards/reward_func/mean": 0.5846918821334839,
      "rewards/reward_func/std": 0.46775349974632263,
      "step": 2155,
      "step_time": 18.694602459669113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 194.625,
      "completions/mean_terminated_length": 194.625,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.28729238361120224,
      "epoch": 0.09986104678091709,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002293812809512019,
      "kl": 0.0019049131078645587,
      "learning_rate": 9.800370541917554e-07,
      "loss": 0.0001,
      "num_tokens": 59224876.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2156,
      "step_time": 20.50570261478424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 196.6875,
      "completions/mean_terminated_length": 196.6875,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.19378679618239403,
      "epoch": 0.0999073645206114,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017660867888480425,
      "kl": 0.0011897815275005996,
      "learning_rate": 9.800277906438166e-07,
      "loss": 0.0001,
      "num_tokens": 59248519.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2157,
      "step_time": 19.057425145059824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 113.0,
      "completions/max_terminated_length": 113.0,
      "completions/mean_length": 98.5,
      "completions/mean_terminated_length": 98.5,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "entropy": 0.2795220613479614,
      "epoch": 0.0999536822603057,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019364472245797515,
      "kl": 0.0016187586006708443,
      "learning_rate": 9.800185270958777e-07,
      "loss": 0.0001,
      "num_tokens": 59268815.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2158,
      "step_time": 11.523467421531677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 426.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 360.75,
      "completions/mean_terminated_length": 360.75,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "entropy": 0.19120676815509796,
      "epoch": 0.1,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012357404921203852,
      "kl": 0.0008733440190553665,
      "learning_rate": 9.800092635479388e-07,
      "loss": 0.0,
      "num_tokens": 59297211.0,
      "reward": 0.9834010601043701,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9834010601043701,
      "rewards/reward_func/std": 0.0,
      "step": 2159,
      "step_time": 34.786151614040136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 165.6875,
      "completions/mean_terminated_length": 165.6875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.33776474744081497,
      "epoch": 0.1000463177396943,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00268212310038507,
      "kl": 0.0019901118939742446,
      "learning_rate": 9.8e-07,
      "loss": 0.0001,
      "num_tokens": 59317766.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2160,
      "step_time": 16.260948821902275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 169.25,
      "completions/mean_terminated_length": 169.25,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3459314554929733,
      "epoch": 0.10009263547938861,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001920197973959148,
      "kl": 0.0013784721959382296,
      "learning_rate": 9.79990736452061e-07,
      "loss": 0.0001,
      "num_tokens": 59345258.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2161,
      "step_time": 18.700891856104136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 447.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 283.5,
      "completions/mean_terminated_length": 283.5,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.4073880463838577,
      "epoch": 0.10013895321908291,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06136735528707504,
      "kl": 0.0022889088722877204,
      "learning_rate": 9.799814729041222e-07,
      "loss": 0.1389,
      "num_tokens": 59374130.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 2162,
      "step_time": 36.16574815660715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 163.0625,
      "completions/mean_terminated_length": 163.0625,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.35263369232416153,
      "epoch": 0.10018527095877722,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003628405509516597,
      "kl": 0.002199364302214235,
      "learning_rate": 9.799722093561833e-07,
      "loss": 0.0001,
      "num_tokens": 59394419.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2163,
      "step_time": 19.95395029336214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 222.625,
      "completions/mean_terminated_length": 222.625,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.44233647733926773,
      "epoch": 0.10023158869847151,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004055425524711609,
      "kl": 0.0034508295357227325,
      "learning_rate": 9.799629458082446e-07,
      "loss": 0.0002,
      "num_tokens": 59428541.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2164,
      "step_time": 26.92737577110529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 188.375,
      "completions/mean_terminated_length": 188.375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3089473396539688,
      "epoch": 0.10027790643816582,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0905863493680954,
      "kl": 0.0030467250617220998,
      "learning_rate": 9.799536822603056e-07,
      "loss": 0.1127,
      "num_tokens": 59453907.0,
      "reward": 0.2638552188873291,
      "reward_std": 0.21108420193195343,
      "rewards/reward_func/mean": 0.2638552188873291,
      "rewards/reward_func/std": 0.21108418703079224,
      "step": 2165,
      "step_time": 21.577270343899727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 169.5625,
      "completions/mean_terminated_length": 169.5625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.38580887019634247,
      "epoch": 0.10032422417786012,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025594322942197323,
      "kl": 0.0018513394461479038,
      "learning_rate": 9.799444187123667e-07,
      "loss": 0.0001,
      "num_tokens": 59485260.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2166,
      "step_time": 22.639272842556238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 159.9375,
      "completions/mean_terminated_length": 159.9375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.1861037164926529,
      "epoch": 0.10037054191755443,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021256774198263884,
      "kl": 0.0013731769286096096,
      "learning_rate": 9.79935155164428e-07,
      "loss": 0.0001,
      "num_tokens": 59505931.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 2167,
      "step_time": 18.174105010926723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 179.1875,
      "completions/mean_terminated_length": 179.1875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.4112344831228256,
      "epoch": 0.10041685965724872,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011156733147799969,
      "kl": 0.005695650819689035,
      "learning_rate": 9.799258916164891e-07,
      "loss": 0.0003,
      "num_tokens": 59543166.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2168,
      "step_time": 23.43252919241786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 166.5,
      "completions/mean_terminated_length": 166.5,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.317965492606163,
      "epoch": 0.10046317739694303,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10522262752056122,
      "kl": 0.0018635388696566224,
      "learning_rate": 9.799166280685503e-07,
      "loss": -0.035,
      "num_tokens": 59570502.0,
      "reward": 0.6956884860992432,
      "reward_std": 0.41583725810050964,
      "rewards/reward_func/mean": 0.6956884860992432,
      "rewards/reward_func/std": 0.41583725810050964,
      "step": 2169,
      "step_time": 20.067191254347563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 134.375,
      "completions/mean_terminated_length": 134.375,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2777117192745209,
      "epoch": 0.10050949513663733,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017494413768872619,
      "kl": 0.0015478904824703932,
      "learning_rate": 9.799073645206114e-07,
      "loss": 0.0001,
      "num_tokens": 59599740.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2170,
      "step_time": 16.83437930420041
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 378.0,
      "completions/max_terminated_length": 378.0,
      "completions/mean_length": 321.625,
      "completions/mean_terminated_length": 321.625,
      "completions/min_length": 249.0,
      "completions/min_terminated_length": 249.0,
      "entropy": 0.23447756096720695,
      "epoch": 0.10055581287633164,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06996208429336548,
      "kl": 0.002022105356445536,
      "learning_rate": 9.798981009726725e-07,
      "loss": -0.0823,
      "num_tokens": 59641062.0,
      "reward": 0.6876200437545776,
      "reward_std": 0.2851105332374573,
      "rewards/reward_func/mean": 0.6876200437545776,
      "rewards/reward_func/std": 0.2851105332374573,
      "step": 2171,
      "step_time": 34.55266473069787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 128.875,
      "completions/mean_terminated_length": 128.875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2668868377804756,
      "epoch": 0.10060213061602594,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011811316944658756,
      "kl": 0.0010578126821201295,
      "learning_rate": 9.798888374247336e-07,
      "loss": 0.0001,
      "num_tokens": 59662116.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2172,
      "step_time": 14.070418328046799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 173.4375,
      "completions/mean_terminated_length": 173.4375,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.39442313462495804,
      "epoch": 0.10064844835572025,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001506642671301961,
      "kl": 0.0018411995843052864,
      "learning_rate": 9.798795738767948e-07,
      "loss": 0.0001,
      "num_tokens": 59697579.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2173,
      "step_time": 20.27152444422245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 157.5,
      "completions/mean_terminated_length": 157.5,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3756882771849632,
      "epoch": 0.10069476609541454,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01110369898378849,
      "kl": 0.003480234125163406,
      "learning_rate": 9.79870310328856e-07,
      "loss": 0.0002,
      "num_tokens": 59751123.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2174,
      "step_time": 25.346865888684988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 194.75,
      "completions/mean_terminated_length": 194.75,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.23016562312841415,
      "epoch": 0.10074108383510885,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10656745731830597,
      "kl": 0.002969473891425878,
      "learning_rate": 9.79861046780917e-07,
      "loss": -0.0385,
      "num_tokens": 59772895.0,
      "reward": 0.9022549390792847,
      "reward_std": 0.2546529173851013,
      "rewards/reward_func/mean": 0.9022549390792847,
      "rewards/reward_func/std": 0.2546529173851013,
      "step": 2175,
      "step_time": 18.87780975922942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 195.4375,
      "completions/mean_terminated_length": 195.4375,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.19872033596038818,
      "epoch": 0.10078740157480315,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005258552264422178,
      "kl": 0.003601659002015367,
      "learning_rate": 9.798517832329781e-07,
      "loss": 0.0002,
      "num_tokens": 59797558.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2176,
      "step_time": 19.6141157746315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 161.25,
      "completions/mean_terminated_length": 161.25,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.32640138268470764,
      "epoch": 0.10083371931449746,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005216421093791723,
      "kl": 0.0028714233194477856,
      "learning_rate": 9.798425196850393e-07,
      "loss": 0.0001,
      "num_tokens": 59818170.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2177,
      "step_time": 16.826649986207485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 120.25,
      "completions/mean_terminated_length": 120.25,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.28045129776000977,
      "epoch": 0.10088003705419175,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001770458067767322,
      "kl": 0.0015245918766595423,
      "learning_rate": 9.798332561371004e-07,
      "loss": 0.0001,
      "num_tokens": 59839822.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2178,
      "step_time": 14.084755450487137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 141.5,
      "completions/mean_terminated_length": 141.5,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3563677594065666,
      "epoch": 0.10092635479388606,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001462252694182098,
      "kl": 0.0015968038060236722,
      "learning_rate": 9.798239925891615e-07,
      "loss": 0.0001,
      "num_tokens": 59875830.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2179,
      "step_time": 18.74991489201784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 140.5,
      "completions/mean_terminated_length": 140.5,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.22664383053779602,
      "epoch": 0.10097267253358036,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006523195654153824,
      "kl": 0.0024224047956522554,
      "learning_rate": 9.798147290412229e-07,
      "loss": 0.0001,
      "num_tokens": 59895566.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2180,
      "step_time": 15.39057507738471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 136.6875,
      "completions/mean_terminated_length": 136.6875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.4017426148056984,
      "epoch": 0.10101899027327467,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003585459664463997,
      "kl": 0.002988903783261776,
      "learning_rate": 9.79805465493284e-07,
      "loss": 0.0001,
      "num_tokens": 59948393.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2181,
      "step_time": 23.900904923677444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 170.625,
      "completions/mean_terminated_length": 170.625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.4459299221634865,
      "epoch": 0.10106530801296897,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018644469091668725,
      "kl": 0.001935254200361669,
      "learning_rate": 9.79796201945345e-07,
      "loss": 0.0001,
      "num_tokens": 59986643.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2182,
      "step_time": 23.580065827816725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 183.625,
      "completions/mean_terminated_length": 183.625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3162192851305008,
      "epoch": 0.10111162575266328,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09607032686471939,
      "kl": 0.002582930203061551,
      "learning_rate": 9.79786938397406e-07,
      "loss": -0.0505,
      "num_tokens": 60009949.0,
      "reward": 0.7515207529067993,
      "reward_std": 0.37357062101364136,
      "rewards/reward_func/mean": 0.7515207529067993,
      "rewards/reward_func/std": 0.37357065081596375,
      "step": 2183,
      "step_time": 19.198546521365643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 307.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 250.5,
      "completions/mean_terminated_length": 250.5,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "entropy": 0.3329693451523781,
      "epoch": 0.10115794349235757,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10372719913721085,
      "kl": 0.0024032650981098413,
      "learning_rate": 9.797776748494674e-07,
      "loss": -0.0148,
      "num_tokens": 60044453.0,
      "reward": 0.9002240896224976,
      "reward_std": 0.05049748346209526,
      "rewards/reward_func/mean": 0.9002240896224976,
      "rewards/reward_func/std": 0.050497494637966156,
      "step": 2184,
      "step_time": 28.069841776043177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 121.5,
      "completions/mean_terminated_length": 121.5,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2763189375400543,
      "epoch": 0.10120426123205188,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017113815993070602,
      "kl": 0.0017008509603329003,
      "learning_rate": 9.797684113015285e-07,
      "loss": 0.0001,
      "num_tokens": 60064269.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2185,
      "step_time": 13.093166135251522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 130.625,
      "completions/mean_terminated_length": 130.625,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.29085466265678406,
      "epoch": 0.10125057897174618,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00194756337441504,
      "kl": 0.0017149012710433453,
      "learning_rate": 9.797591477535896e-07,
      "loss": 0.0001,
      "num_tokens": 60085303.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2186,
      "step_time": 14.907429192215204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 139.875,
      "completions/mean_terminated_length": 139.875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.28259269148111343,
      "epoch": 0.10129689671144049,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034371220972388983,
      "kl": 0.0021036481484770775,
      "learning_rate": 9.797498842056507e-07,
      "loss": 0.0001,
      "num_tokens": 60110469.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2187,
      "step_time": 15.739911269396544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 172.0,
      "completions/mean_terminated_length": 172.0,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.42096367478370667,
      "epoch": 0.10134321445113478,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002669628243893385,
      "kl": 0.002495112828910351,
      "learning_rate": 9.797406206577119e-07,
      "loss": 0.0001,
      "num_tokens": 60162709.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2188,
      "step_time": 25.90372997522354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 184.125,
      "completions/mean_terminated_length": 184.125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.20531976968050003,
      "epoch": 0.1013895321908291,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14281108975410461,
      "kl": 0.002182638010708615,
      "learning_rate": 9.79731357109773e-07,
      "loss": -0.0356,
      "num_tokens": 60197575.0,
      "reward": 0.8571484088897705,
      "reward_std": 0.22958418726921082,
      "rewards/reward_func/mean": 0.8571484088897705,
      "rewards/reward_func/std": 0.22958418726921082,
      "step": 2189,
      "step_time": 25.103964366018772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 167.5,
      "completions/mean_terminated_length": 167.5,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3038442134857178,
      "epoch": 0.10143584993052339,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028399541042745113,
      "kl": 0.0025349673815071583,
      "learning_rate": 9.79722093561834e-07,
      "loss": 0.0001,
      "num_tokens": 60235167.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2190,
      "step_time": 24.349001679569483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 136.25,
      "completions/mean_terminated_length": 136.25,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.32105864584445953,
      "epoch": 0.1014821676702177,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001613329048268497,
      "kl": 0.0014620284782722592,
      "learning_rate": 9.797128300138952e-07,
      "loss": 0.0001,
      "num_tokens": 60263635.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2191,
      "step_time": 16.700292360037565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 280.0,
      "completions/max_terminated_length": 280.0,
      "completions/mean_length": 230.4375,
      "completions/mean_terminated_length": 230.4375,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.2706785500049591,
      "epoch": 0.101528485409912,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11339399218559265,
      "kl": 0.002971329726278782,
      "learning_rate": 9.797035664659564e-07,
      "loss": -0.0437,
      "num_tokens": 60290618.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 2192,
      "step_time": 24.38199918717146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 127.4375,
      "completions/mean_terminated_length": 127.4375,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.25459761917591095,
      "epoch": 0.1015748031496063,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025886453222483397,
      "kl": 0.0016027696547098458,
      "learning_rate": 9.796943029180175e-07,
      "loss": 0.0001,
      "num_tokens": 60310017.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2193,
      "step_time": 13.862449113279581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 207.9375,
      "completions/mean_terminated_length": 207.9375,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.3888969272375107,
      "epoch": 0.1016211208893006,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028763385489583015,
      "kl": 0.0023363720101770014,
      "learning_rate": 9.796850393700788e-07,
      "loss": 0.0001,
      "num_tokens": 60336672.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2194,
      "step_time": 21.99916561320424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 365.0,
      "completions/max_terminated_length": 365.0,
      "completions/mean_length": 226.5625,
      "completions/mean_terminated_length": 226.5625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.3594680428504944,
      "epoch": 0.10166743862899491,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08128580451011658,
      "kl": 0.0030222826171666384,
      "learning_rate": 9.796757758221397e-07,
      "loss": -0.1862,
      "num_tokens": 60375545.0,
      "reward": 0.17653314769268036,
      "reward_std": 0.37953487038612366,
      "rewards/reward_func/mean": 0.17653314769268036,
      "rewards/reward_func/std": 0.37953487038612366,
      "step": 2195,
      "step_time": 33.69244493171573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 239.375,
      "completions/mean_terminated_length": 239.375,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "entropy": 0.23426999151706696,
      "epoch": 0.1017137563686892,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07784437388181686,
      "kl": 0.0045803755056113005,
      "learning_rate": 9.796665122742009e-07,
      "loss": -0.0354,
      "num_tokens": 60413903.0,
      "reward": 0.7629547715187073,
      "reward_std": 0.31606027483940125,
      "rewards/reward_func/mean": 0.7629547715187073,
      "rewards/reward_func/std": 0.31606027483940125,
      "step": 2196,
      "step_time": 27.606477454304695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 144.5,
      "completions/mean_terminated_length": 144.5,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.1536317691206932,
      "epoch": 0.10176007410838352,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10756503790616989,
      "kl": 0.012938304804265499,
      "learning_rate": 9.796572487262622e-07,
      "loss": -0.0196,
      "num_tokens": 60436695.0,
      "reward": 0.9304066896438599,
      "reward_std": 0.19016513228416443,
      "rewards/reward_func/mean": 0.9304066896438599,
      "rewards/reward_func/std": 0.19016513228416443,
      "step": 2197,
      "step_time": 15.513894945383072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 299.0,
      "completions/max_terminated_length": 299.0,
      "completions/mean_length": 232.125,
      "completions/mean_terminated_length": 232.125,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "entropy": 0.3011670559644699,
      "epoch": 0.10180639184807781,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08634068071842194,
      "kl": 0.004210830433294177,
      "learning_rate": 9.796479851783233e-07,
      "loss": 0.0109,
      "num_tokens": 60465065.0,
      "reward": 0.4947678744792938,
      "reward_std": 0.13267172873020172,
      "rewards/reward_func/mean": 0.4947678744792938,
      "rewards/reward_func/std": 0.13267171382904053,
      "step": 2198,
      "step_time": 25.829206820577383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 144.125,
      "completions/mean_terminated_length": 144.125,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.3724542334675789,
      "epoch": 0.10185270958777212,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014387345872819424,
      "kl": 0.0015553278208244592,
      "learning_rate": 9.796387216303844e-07,
      "loss": 0.0001,
      "num_tokens": 60490875.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2199,
      "step_time": 16.612209875136614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 206.9375,
      "completions/mean_terminated_length": 206.9375,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.275962233543396,
      "epoch": 0.10189902732746642,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0932672768831253,
      "kl": 0.005184296751394868,
      "learning_rate": 9.796294580824456e-07,
      "loss": -0.0711,
      "num_tokens": 60514410.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 2200,
      "step_time": 22.264834202826023
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 215.5625,
      "completions/mean_terminated_length": 215.5625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.27668239921331406,
      "epoch": 0.10194534506716073,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12937963008880615,
      "kl": 0.00448174070334062,
      "learning_rate": 9.796201945345067e-07,
      "loss": -0.1306,
      "num_tokens": 60550947.0,
      "reward": 0.5384228229522705,
      "reward_std": 0.21965906023979187,
      "rewards/reward_func/mean": 0.5384228229522705,
      "rewards/reward_func/std": 0.21965906023979187,
      "step": 2201,
      "step_time": 29.706990282982588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 178.875,
      "completions/mean_terminated_length": 178.875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3306841403245926,
      "epoch": 0.10199166280685502,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006056199781596661,
      "kl": 0.003290728200227022,
      "learning_rate": 9.796109309865678e-07,
      "loss": 0.0002,
      "num_tokens": 60594753.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2202,
      "step_time": 24.14627345278859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 184.625,
      "completions/mean_terminated_length": 184.625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.36340101063251495,
      "epoch": 0.10203798054654933,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016917209140956402,
      "kl": 0.001690033939667046,
      "learning_rate": 9.79601667438629e-07,
      "loss": 0.0001,
      "num_tokens": 60616251.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2203,
      "step_time": 19.759938970208168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 252.1875,
      "completions/mean_terminated_length": 252.1875,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "entropy": 0.3854904919862747,
      "epoch": 0.10208429828624363,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07628150284290314,
      "kl": 0.0016758107813075185,
      "learning_rate": 9.7959240389069e-07,
      "loss": 0.0077,
      "num_tokens": 60656286.0,
      "reward": 0.7000828385353088,
      "reward_std": 0.005255452822893858,
      "rewards/reward_func/mean": 0.7000828385353088,
      "rewards/reward_func/std": 0.005255445837974548,
      "step": 2204,
      "step_time": 28.58991450443864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 146.0625,
      "completions/mean_terminated_length": 146.0625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2740960121154785,
      "epoch": 0.10213061602593794,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003894086927175522,
      "kl": 0.0018835398368537426,
      "learning_rate": 9.795831403427512e-07,
      "loss": 0.0001,
      "num_tokens": 60679791.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2205,
      "step_time": 17.592174660414457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 113.25,
      "completions/mean_terminated_length": 113.25,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.23828605934977531,
      "epoch": 0.10217693376563224,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019345965702086687,
      "kl": 0.0015995536523405463,
      "learning_rate": 9.795738767948123e-07,
      "loss": 0.0001,
      "num_tokens": 60699443.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2206,
      "step_time": 13.819919727742672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 183.625,
      "completions/mean_terminated_length": 183.625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3812277764081955,
      "epoch": 0.10222325150532655,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10260231047868729,
      "kl": 0.002770874183624983,
      "learning_rate": 9.795646132468737e-07,
      "loss": 0.0356,
      "num_tokens": 60721533.0,
      "reward": 0.7803013324737549,
      "reward_std": 0.23197969794273376,
      "rewards/reward_func/mean": 0.7803013324737549,
      "rewards/reward_func/std": 0.23197971284389496,
      "step": 2207,
      "step_time": 19.418404404073954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 368.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 245.8125,
      "completions/mean_terminated_length": 245.8125,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.37642689049243927,
      "epoch": 0.10226956924502084,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09306300431489944,
      "kl": 0.002558030537329614,
      "learning_rate": 9.795553496989346e-07,
      "loss": 0.1388,
      "num_tokens": 60745098.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 2208,
      "step_time": 29.97770418971777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 132.5,
      "completions/mean_terminated_length": 132.5,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.20292287319898605,
      "epoch": 0.10231588698471515,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004116197116672993,
      "kl": 0.0016079325578175485,
      "learning_rate": 9.795460861509957e-07,
      "loss": 0.0001,
      "num_tokens": 60764626.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2209,
      "step_time": 15.321424350142479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 171.625,
      "completions/mean_terminated_length": 171.625,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.1554102674126625,
      "epoch": 0.10236220472440945,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011155969696119428,
      "kl": 0.0008762629440752789,
      "learning_rate": 9.79536822603057e-07,
      "loss": 0.0,
      "num_tokens": 60787372.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 2210,
      "step_time": 17.346901450306177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 116.125,
      "completions/mean_terminated_length": 116.125,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.23476694524288177,
      "epoch": 0.10240852246410376,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002404983853921294,
      "kl": 0.0013003416825085878,
      "learning_rate": 9.795275590551182e-07,
      "loss": 0.0001,
      "num_tokens": 60807022.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2211,
      "step_time": 13.673759322613478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 353.0,
      "completions/max_terminated_length": 353.0,
      "completions/mean_length": 286.25,
      "completions/mean_terminated_length": 286.25,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "entropy": 0.24284956231713295,
      "epoch": 0.10245484020379805,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021608276292681694,
      "kl": 0.001618764887098223,
      "learning_rate": 9.795182955071793e-07,
      "loss": 0.0001,
      "num_tokens": 60835042.0,
      "reward": 0.9368637800216675,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9368637800216675,
      "rewards/reward_func/std": 0.0,
      "step": 2212,
      "step_time": 30.086726807057858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 120.625,
      "completions/mean_terminated_length": 120.625,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.24929409474134445,
      "epoch": 0.10250115794349236,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024518666323274374,
      "kl": 0.0014328828256111592,
      "learning_rate": 9.795090319592404e-07,
      "loss": 0.0001,
      "num_tokens": 60856780.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2213,
      "step_time": 13.360966384410858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 168.25,
      "completions/mean_terminated_length": 168.25,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.38240545988082886,
      "epoch": 0.10254747568318666,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022746773902326822,
      "kl": 0.002078428486129269,
      "learning_rate": 9.794997684113015e-07,
      "loss": 0.0001,
      "num_tokens": 60906128.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2214,
      "step_time": 23.992529205977917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 153.875,
      "completions/mean_terminated_length": 153.875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.1575748734176159,
      "epoch": 0.10259379342288097,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004232973325997591,
      "kl": 0.0023800735943950713,
      "learning_rate": 9.794905048633627e-07,
      "loss": 0.0001,
      "num_tokens": 60942366.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 2215,
      "step_time": 20.61464837566018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 105.375,
      "completions/mean_terminated_length": 105.375,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.31054776161909103,
      "epoch": 0.10264011116257526,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016936726169660687,
      "kl": 0.001551642024423927,
      "learning_rate": 9.794812413154238e-07,
      "loss": 0.0001,
      "num_tokens": 60962292.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2216,
      "step_time": 13.38803456351161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 146.625,
      "completions/mean_terminated_length": 146.625,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3204849287867546,
      "epoch": 0.10268642890226957,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010563879273831844,
      "kl": 0.004847561183851212,
      "learning_rate": 9.79471977767485e-07,
      "loss": 0.0002,
      "num_tokens": 60982606.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2217,
      "step_time": 15.205908689647913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 205.0625,
      "completions/mean_terminated_length": 205.0625,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.27916259318590164,
      "epoch": 0.10273274664196387,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000970926892478019,
      "kl": 0.0012351367040537298,
      "learning_rate": 9.79462714219546e-07,
      "loss": 0.0001,
      "num_tokens": 61026831.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2218,
      "step_time": 26.449595969170332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 121.8125,
      "completions/mean_terminated_length": 121.8125,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.27991413325071335,
      "epoch": 0.10277906438165818,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008126037195324898,
      "kl": 0.003011845867149532,
      "learning_rate": 9.794534506716072e-07,
      "loss": 0.0001,
      "num_tokens": 61056252.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2219,
      "step_time": 16.91433237120509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 189.4375,
      "completions/mean_terminated_length": 189.4375,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.4102216511964798,
      "epoch": 0.10282538212135248,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11200933903455734,
      "kl": 0.005165300099179149,
      "learning_rate": 9.794441871236683e-07,
      "loss": -0.052,
      "num_tokens": 61082083.0,
      "reward": 0.27963730692863464,
      "reward_std": 0.42837172746658325,
      "rewards/reward_func/mean": 0.27963730692863464,
      "rewards/reward_func/std": 0.42837172746658325,
      "step": 2220,
      "step_time": 24.32019878551364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 167.625,
      "completions/mean_terminated_length": 167.625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.35585759580135345,
      "epoch": 0.10287169986104679,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016712772194296122,
      "kl": 0.0015667208936065435,
      "learning_rate": 9.794349235757294e-07,
      "loss": 0.0001,
      "num_tokens": 61116269.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2221,
      "step_time": 20.54244640469551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 122.5625,
      "completions/mean_terminated_length": 122.5625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.29270917922258377,
      "epoch": 0.10291801760074108,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013240029802545905,
      "kl": 0.0016460044425912201,
      "learning_rate": 9.794256600277905e-07,
      "loss": 0.0001,
      "num_tokens": 61136070.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2222,
      "step_time": 13.899787869304419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 172.625,
      "completions/mean_terminated_length": 172.625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.23054639250040054,
      "epoch": 0.10296433534043539,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15053316950798035,
      "kl": 0.002577450970420614,
      "learning_rate": 9.794163964798516e-07,
      "loss": -0.0639,
      "num_tokens": 61163856.0,
      "reward": 0.909626841545105,
      "reward_std": 0.02409949339926243,
      "rewards/reward_func/mean": 0.909626841545105,
      "rewards/reward_func/std": 0.024099500849843025,
      "step": 2223,
      "step_time": 20.294577702879906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 174.1875,
      "completions/mean_terminated_length": 174.1875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.40985822677612305,
      "epoch": 0.10301065308012969,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002675432711839676,
      "kl": 0.0021192085405346006,
      "learning_rate": 9.79407132931913e-07,
      "loss": 0.0001,
      "num_tokens": 61197347.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2224,
      "step_time": 21.854994174093008
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 236.6875,
      "completions/mean_terminated_length": 236.6875,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "entropy": 0.2156229093670845,
      "epoch": 0.103056970819824,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06497209519147873,
      "kl": 0.0016112701559904963,
      "learning_rate": 9.793978693839741e-07,
      "loss": -0.0143,
      "num_tokens": 61236030.0,
      "reward": 0.4116564095020294,
      "reward_std": 0.15257705748081207,
      "rewards/reward_func/mean": 0.4116564095020294,
      "rewards/reward_func/std": 0.15257705748081207,
      "step": 2225,
      "step_time": 26.80081870406866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 165.375,
      "completions/mean_terminated_length": 165.375,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.403880774974823,
      "epoch": 0.1031032885595183,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007626906502991915,
      "kl": 0.004239951784256846,
      "learning_rate": 9.79388605836035e-07,
      "loss": 0.0002,
      "num_tokens": 61284100.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2226,
      "step_time": 24.646736599504948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 179.375,
      "completions/mean_terminated_length": 179.375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3995928168296814,
      "epoch": 0.1031496062992126,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026067052967846394,
      "kl": 0.0024176547303795815,
      "learning_rate": 9.793793422880964e-07,
      "loss": 0.0001,
      "num_tokens": 61305754.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2227,
      "step_time": 19.135168179869652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 131.0,
      "completions/mean_length": 115.125,
      "completions/mean_terminated_length": 115.125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.299918569624424,
      "epoch": 0.1031959240389069,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026245375629514456,
      "kl": 0.0017001339292619377,
      "learning_rate": 9.793700787401575e-07,
      "loss": 0.0001,
      "num_tokens": 61327084.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2228,
      "step_time": 13.109154216945171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 132.625,
      "completions/mean_terminated_length": 132.625,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2734079398214817,
      "epoch": 0.10324224177860121,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00537524838000536,
      "kl": 0.002722400415223092,
      "learning_rate": 9.793608151922186e-07,
      "loss": 0.0001,
      "num_tokens": 61347478.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2229,
      "step_time": 14.749796010553837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 142.1875,
      "completions/mean_terminated_length": 142.1875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3564213216304779,
      "epoch": 0.1032885595182955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014428168069571257,
      "kl": 0.0014983734581619501,
      "learning_rate": 9.793515516442797e-07,
      "loss": 0.0001,
      "num_tokens": 61379161.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2230,
      "step_time": 18.56022620201111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 247.5625,
      "completions/mean_terminated_length": 247.5625,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "entropy": 0.19931424036622047,
      "epoch": 0.10333487725798982,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09483068436384201,
      "kl": 0.002567407733295113,
      "learning_rate": 9.793422880963409e-07,
      "loss": -0.0195,
      "num_tokens": 61414754.0,
      "reward": 0.18062886595726013,
      "reward_std": 0.05885840579867363,
      "rewards/reward_func/mean": 0.18062886595726013,
      "rewards/reward_func/std": 0.05885840579867363,
      "step": 2231,
      "step_time": 26.85329046472907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 232.0,
      "completions/mean_terminated_length": 232.0,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.2816247157752514,
      "epoch": 0.10338119499768411,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11701363325119019,
      "kl": 0.004782899166457355,
      "learning_rate": 9.79333024548402e-07,
      "loss": -0.0859,
      "num_tokens": 61439506.0,
      "reward": 0.5287646055221558,
      "reward_std": 0.4860803186893463,
      "rewards/reward_func/mean": 0.5287646055221558,
      "rewards/reward_func/std": 0.4860803186893463,
      "step": 2232,
      "step_time": 23.685327105224133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 118.4375,
      "completions/mean_terminated_length": 118.4375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2728803679347038,
      "epoch": 0.10342751273737842,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003786456538364291,
      "kl": 0.0021344450942706317,
      "learning_rate": 9.793237610004631e-07,
      "loss": 0.0001,
      "num_tokens": 61460505.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2233,
      "step_time": 13.11510282382369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 158.25,
      "completions/mean_terminated_length": 158.25,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.36266087740659714,
      "epoch": 0.10347383047707272,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002360459417104721,
      "kl": 0.001996085455175489,
      "learning_rate": 9.793144974525242e-07,
      "loss": 0.0001,
      "num_tokens": 61491837.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2234,
      "step_time": 19.55891367048025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 192.9375,
      "completions/mean_terminated_length": 192.9375,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.22553859278559685,
      "epoch": 0.10352014821676703,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029795700684189796,
      "kl": 0.002468937949743122,
      "learning_rate": 9.793052339045854e-07,
      "loss": 0.0001,
      "num_tokens": 61528524.0,
      "reward": 0.9661049842834473,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9661049842834473,
      "rewards/reward_func/std": 0.0,
      "step": 2235,
      "step_time": 23.35491492599249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 162.375,
      "completions/mean_terminated_length": 162.375,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.2922440990805626,
      "epoch": 0.10356646595646132,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036938011180609465,
      "kl": 0.0020365714735817164,
      "learning_rate": 9.792959703566465e-07,
      "loss": 0.0001,
      "num_tokens": 61548930.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2236,
      "step_time": 18.090626504272223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 182.5625,
      "completions/mean_terminated_length": 182.5625,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.12921619042754173,
      "epoch": 0.10361278369615563,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010431903647258878,
      "kl": 0.0007840915495762601,
      "learning_rate": 9.792867068087078e-07,
      "loss": 0.0,
      "num_tokens": 61573051.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 2237,
      "step_time": 19.422021348029375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 137.625,
      "completions/mean_terminated_length": 137.625,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2800588682293892,
      "epoch": 0.10365910143584993,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001400663866661489,
      "kl": 0.0014587149489670992,
      "learning_rate": 9.792774432607687e-07,
      "loss": 0.0001,
      "num_tokens": 61598869.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2238,
      "step_time": 17.431943271309137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 159.0,
      "completions/mean_terminated_length": 159.0,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.2698532044887543,
      "epoch": 0.10370541917554424,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0040842462331056595,
      "kl": 0.0030865041771903634,
      "learning_rate": 9.792681797128299e-07,
      "loss": 0.0002,
      "num_tokens": 61623189.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 2239,
      "step_time": 17.110507179051638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 160.5625,
      "completions/mean_terminated_length": 160.5625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3302495777606964,
      "epoch": 0.10375173691523853,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007209908217191696,
      "kl": 0.003326426784042269,
      "learning_rate": 9.792589161648912e-07,
      "loss": 0.0002,
      "num_tokens": 61645742.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2240,
      "step_time": 17.75124305486679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 167.625,
      "completions/mean_terminated_length": 167.625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.19000709801912308,
      "epoch": 0.10379805465493284,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09699032455682755,
      "kl": 0.005310878332238644,
      "learning_rate": 9.792496526169523e-07,
      "loss": -0.0233,
      "num_tokens": 61672648.0,
      "reward": 0.5217881202697754,
      "reward_std": 0.38342076539993286,
      "rewards/reward_func/mean": 0.5217881202697754,
      "rewards/reward_func/std": 0.3834207057952881,
      "step": 2241,
      "step_time": 18.545949436724186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 119.5625,
      "completions/mean_terminated_length": 119.5625,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.2775387093424797,
      "epoch": 0.10384437239462714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009978383779525757,
      "kl": 0.0031977643084246665,
      "learning_rate": 9.792403890690134e-07,
      "loss": 0.0002,
      "num_tokens": 61694353.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2242,
      "step_time": 13.95238033682108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 196.0,
      "completions/mean_terminated_length": 196.0,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.32570119202136993,
      "epoch": 0.10389069013432145,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13308730721473694,
      "kl": 0.006898187682963908,
      "learning_rate": 9.792311255210746e-07,
      "loss": -0.0075,
      "num_tokens": 61716193.0,
      "reward": 0.22180122137069702,
      "reward_std": 0.39677008986473083,
      "rewards/reward_func/mean": 0.22180122137069702,
      "rewards/reward_func/std": 0.3967701196670532,
      "step": 2243,
      "step_time": 20.08134526014328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 255.0,
      "completions/max_terminated_length": 255.0,
      "completions/mean_length": 218.25,
      "completions/mean_terminated_length": 218.25,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.2500312253832817,
      "epoch": 0.10393700787401575,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10359306633472443,
      "kl": 0.004036617727251723,
      "learning_rate": 9.792218619731357e-07,
      "loss": -0.0516,
      "num_tokens": 61739701.0,
      "reward": 0.35930418968200684,
      "reward_std": 0.2904965281486511,
      "rewards/reward_func/mean": 0.35930418968200684,
      "rewards/reward_func/std": 0.2904965877532959,
      "step": 2244,
      "step_time": 22.156222824007273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 179.875,
      "completions/mean_terminated_length": 179.875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.38613519072532654,
      "epoch": 0.10398332561371006,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006123799365013838,
      "kl": 0.003989014774560928,
      "learning_rate": 9.792125984251968e-07,
      "loss": 0.0002,
      "num_tokens": 61766723.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2245,
      "step_time": 19.492330126464367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 141.0,
      "completions/mean_terminated_length": 141.0,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3111994042992592,
      "epoch": 0.10402964335340435,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001901285257190466,
      "kl": 0.0015827443567104638,
      "learning_rate": 9.79203334877258e-07,
      "loss": 0.0001,
      "num_tokens": 61802835.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2246,
      "step_time": 18.98337061330676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 131.125,
      "completions/mean_terminated_length": 131.125,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.27539508789777756,
      "epoch": 0.10407596109309866,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011728698154911399,
      "kl": 0.001241259480593726,
      "learning_rate": 9.79194071329319e-07,
      "loss": 0.0001,
      "num_tokens": 61829397.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2247,
      "step_time": 15.216758225113153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 212.9375,
      "completions/mean_terminated_length": 212.9375,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.22671232372522354,
      "epoch": 0.10412227883279296,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07702209055423737,
      "kl": 0.0026329063693992794,
      "learning_rate": 9.791848077813802e-07,
      "loss": -0.0975,
      "num_tokens": 61852516.0,
      "reward": 0.7919193506240845,
      "reward_std": 0.3722260296344757,
      "rewards/reward_func/mean": 0.7919193506240845,
      "rewards/reward_func/std": 0.3722260594367981,
      "step": 2248,
      "step_time": 22.158296890556812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 140.0,
      "completions/mean_terminated_length": 140.0,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.21035386621952057,
      "epoch": 0.10416859657248727,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11730952560901642,
      "kl": 0.004817256878595799,
      "learning_rate": 9.791755442334413e-07,
      "loss": -0.0703,
      "num_tokens": 61884212.0,
      "reward": 0.3567105531692505,
      "reward_std": 0.35956937074661255,
      "rewards/reward_func/mean": 0.3567105531692505,
      "rewards/reward_func/std": 0.35956940054893494,
      "step": 2249,
      "step_time": 18.170773435384035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 208.1875,
      "completions/mean_terminated_length": 208.1875,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.21844256669282913,
      "epoch": 0.10421491431218156,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034418818540871143,
      "kl": 0.00255079276394099,
      "learning_rate": 9.791662806855027e-07,
      "loss": 0.0001,
      "num_tokens": 61912503.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2250,
      "step_time": 21.4816131927073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 166.3125,
      "completions/mean_terminated_length": 166.3125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.39760953187942505,
      "epoch": 0.10426123205187587,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021134852431714535,
      "kl": 0.0018708638963289559,
      "learning_rate": 9.791570171375636e-07,
      "loss": 0.0001,
      "num_tokens": 61945644.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2251,
      "step_time": 20.125511031597853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 112.4375,
      "completions/mean_terminated_length": 112.4375,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.3092518672347069,
      "epoch": 0.10430754979157017,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028821611776947975,
      "kl": 0.0024511757073923945,
      "learning_rate": 9.791477535896247e-07,
      "loss": 0.0001,
      "num_tokens": 61967811.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2252,
      "step_time": 13.239852078258991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 149.0625,
      "completions/mean_terminated_length": 149.0625,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.3802819848060608,
      "epoch": 0.10435386753126448,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005940168164670467,
      "kl": 0.0029723774059675634,
      "learning_rate": 9.791384900416858e-07,
      "loss": 0.0001,
      "num_tokens": 61988980.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2253,
      "step_time": 17.387725837528706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 156.1875,
      "completions/mean_terminated_length": 156.1875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.27446040511131287,
      "epoch": 0.10440018527095878,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002605691086500883,
      "kl": 0.00233684346312657,
      "learning_rate": 9.791292264937472e-07,
      "loss": 0.0001,
      "num_tokens": 62038871.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2254,
      "step_time": 22.929054759442806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 126.625,
      "completions/mean_terminated_length": 126.625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.1548544466495514,
      "epoch": 0.10444650301065309,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018125120550394058,
      "kl": 0.0014349717530421913,
      "learning_rate": 9.791199629458083e-07,
      "loss": 0.0001,
      "num_tokens": 62059761.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 2255,
      "step_time": 13.319679863750935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 164.6875,
      "completions/mean_terminated_length": 164.6875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.36026181280612946,
      "epoch": 0.10449282075034738,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0051782578229904175,
      "kl": 0.003570305823814124,
      "learning_rate": 9.791106993978694e-07,
      "loss": 0.0002,
      "num_tokens": 62080364.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2256,
      "step_time": 17.787369839847088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 191.125,
      "completions/mean_terminated_length": 191.125,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.417837455868721,
      "epoch": 0.10453913849004169,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002698037540540099,
      "kl": 0.0022583184181712568,
      "learning_rate": 9.791014358499305e-07,
      "loss": 0.0001,
      "num_tokens": 62109182.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2257,
      "step_time": 22.217093612998724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 163.875,
      "completions/mean_terminated_length": 163.875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.329865463078022,
      "epoch": 0.10458545622973599,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004828733392059803,
      "kl": 0.003312196582555771,
      "learning_rate": 9.790921723019917e-07,
      "loss": 0.0002,
      "num_tokens": 62130732.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2258,
      "step_time": 18.443362843245268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 172.3125,
      "completions/mean_terminated_length": 172.3125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.29036425799131393,
      "epoch": 0.1046317739694303,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002593018813058734,
      "kl": 0.0017635401454754174,
      "learning_rate": 9.790829087540528e-07,
      "loss": 0.0001,
      "num_tokens": 62151313.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2259,
      "step_time": 18.20293040201068
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 506.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 472.125,
      "completions/mean_terminated_length": 472.125,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "entropy": 0.09846077300608158,
      "epoch": 0.1046780917091246,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004796440480276942,
      "kl": 0.0004201960182399489,
      "learning_rate": 9.79073645206114e-07,
      "loss": 0.0,
      "num_tokens": 62195827.0,
      "reward": 0.49658530950546265,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.49658530950546265,
      "rewards/reward_func/std": 0.0,
      "step": 2260,
      "step_time": 43.809696685522795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 175.3125,
      "completions/mean_terminated_length": 175.3125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.39920640736818314,
      "epoch": 0.1047244094488189,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035484484396874905,
      "kl": 0.0028538938495330513,
      "learning_rate": 9.79064381658175e-07,
      "loss": 0.0001,
      "num_tokens": 62218264.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2261,
      "step_time": 19.64612052962184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 133.75,
      "completions/mean_terminated_length": 133.75,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.26653461903333664,
      "epoch": 0.1047707271885132,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003153836587443948,
      "kl": 0.0017807056137826294,
      "learning_rate": 9.790551181102362e-07,
      "loss": 0.0001,
      "num_tokens": 62237796.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2262,
      "step_time": 13.603754296898842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 150.6875,
      "completions/mean_terminated_length": 150.6875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.21984482184052467,
      "epoch": 0.10481704492820751,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004111500922590494,
      "kl": 0.002681127982214093,
      "learning_rate": 9.790458545622973e-07,
      "loss": 0.0001,
      "num_tokens": 62259887.0,
      "reward": 0.8781879544258118,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8781879544258118,
      "rewards/reward_func/std": 0.0,
      "step": 2263,
      "step_time": 18.170254323631525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 145.125,
      "completions/mean_terminated_length": 145.125,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3419824466109276,
      "epoch": 0.1048633626679018,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005954916123300791,
      "kl": 0.005005842307582498,
      "learning_rate": 9.790365910143584e-07,
      "loss": 0.0003,
      "num_tokens": 62279793.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2264,
      "step_time": 15.43479885160923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 119.6875,
      "completions/mean_terminated_length": 119.6875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2846032828092575,
      "epoch": 0.10490968040759611,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015523831825703382,
      "kl": 0.0015785988653078675,
      "learning_rate": 9.790273274664195e-07,
      "loss": 0.0001,
      "num_tokens": 62300156.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2265,
      "step_time": 13.51823365315795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 216.375,
      "completions/mean_terminated_length": 216.375,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.4687364399433136,
      "epoch": 0.10495599814729041,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10033821314573288,
      "kl": 0.005565599538385868,
      "learning_rate": 9.790180639184807e-07,
      "loss": -0.0393,
      "num_tokens": 62325186.0,
      "reward": 0.07428629696369171,
      "reward_std": 0.24917595088481903,
      "rewards/reward_func/mean": 0.07428629696369171,
      "rewards/reward_func/std": 0.24917596578598022,
      "step": 2266,
      "step_time": 23.35210544988513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 141.1875,
      "completions/mean_terminated_length": 141.1875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.34766803681850433,
      "epoch": 0.10500231588698472,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001703002955764532,
      "kl": 0.001662793365539983,
      "learning_rate": 9.79008800370542e-07,
      "loss": 0.0001,
      "num_tokens": 62353349.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2267,
      "step_time": 16.5883132442832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 173.5,
      "completions/mean_terminated_length": 173.5,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.41440922021865845,
      "epoch": 0.10504863362667902,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0042607043869793415,
      "kl": 0.0029541688272729516,
      "learning_rate": 9.789995368226031e-07,
      "loss": 0.0001,
      "num_tokens": 62377757.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2268,
      "step_time": 18.798588771373034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 147.3125,
      "completions/mean_terminated_length": 147.3125,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.17437032982707024,
      "epoch": 0.10509495136637333,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024604839272797108,
      "kl": 0.0013842256157658994,
      "learning_rate": 9.78990273274664e-07,
      "loss": 0.0001,
      "num_tokens": 62407154.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2269,
      "step_time": 17.567027255892754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 170.125,
      "completions/mean_terminated_length": 170.125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3062300756573677,
      "epoch": 0.10514126910606762,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024556834250688553,
      "kl": 0.0015224769886117429,
      "learning_rate": 9.789810097267252e-07,
      "loss": 0.0001,
      "num_tokens": 62438820.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2270,
      "step_time": 19.446268923580647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 173.4375,
      "completions/mean_terminated_length": 173.4375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.18224821239709854,
      "epoch": 0.10518758684576193,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11770366132259369,
      "kl": 0.0016775375988800079,
      "learning_rate": 9.789717461787865e-07,
      "loss": 0.0153,
      "num_tokens": 62471883.0,
      "reward": 0.9904050827026367,
      "reward_std": 0.03837955743074417,
      "rewards/reward_func/mean": 0.9904050827026367,
      "rewards/reward_func/std": 0.038379568606615067,
      "step": 2271,
      "step_time": 20.685834363102913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 140.625,
      "completions/mean_terminated_length": 140.625,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3279189094901085,
      "epoch": 0.10523390458545623,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022518103942275047,
      "kl": 0.0020676348358392715,
      "learning_rate": 9.789624826308476e-07,
      "loss": 0.0001,
      "num_tokens": 62498741.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2272,
      "step_time": 17.294695295393467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 157.6875,
      "completions/mean_terminated_length": 157.6875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.2894827052950859,
      "epoch": 0.10528022232515054,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004258665721863508,
      "kl": 0.002303234301507473,
      "learning_rate": 9.789532190829087e-07,
      "loss": 0.0001,
      "num_tokens": 62519328.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2273,
      "step_time": 16.723687145859003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 159.25,
      "completions/mean_terminated_length": 159.25,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.22289356961846352,
      "epoch": 0.10532654006484483,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034214637707918882,
      "kl": 0.0023974603973329067,
      "learning_rate": 9.789439555349699e-07,
      "loss": 0.0001,
      "num_tokens": 62540564.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 2274,
      "step_time": 16.21651890128851
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 112.6875,
      "completions/mean_terminated_length": 112.6875,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.24744964390993118,
      "epoch": 0.10537285780453914,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016079018823802471,
      "kl": 0.0013612003094749525,
      "learning_rate": 9.78934691987031e-07,
      "loss": 0.0001,
      "num_tokens": 62560495.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2275,
      "step_time": 12.973549351096153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 156.4375,
      "completions/mean_terminated_length": 156.4375,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.4443458691239357,
      "epoch": 0.10541917554423344,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036383571568876505,
      "kl": 0.0024798159720376134,
      "learning_rate": 9.789254284390921e-07,
      "loss": 0.0001,
      "num_tokens": 62607318.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2276,
      "step_time": 23.97321081161499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 262.0,
      "completions/max_terminated_length": 262.0,
      "completions/mean_length": 219.8125,
      "completions/mean_terminated_length": 219.8125,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "entropy": 0.39555609226226807,
      "epoch": 0.10546549328392775,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07289256900548935,
      "kl": 0.002941161103080958,
      "learning_rate": 9.789161648911532e-07,
      "loss": 0.0065,
      "num_tokens": 62629731.0,
      "reward": 0.8440167903900146,
      "reward_std": 0.06088960915803909,
      "rewards/reward_func/mean": 0.8440167903900146,
      "rewards/reward_func/std": 0.06088960915803909,
      "step": 2277,
      "step_time": 22.16989605501294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 218.0625,
      "completions/mean_terminated_length": 218.0625,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.32909025996923447,
      "epoch": 0.10551181102362205,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009037869051098824,
      "kl": 0.005931343417614698,
      "learning_rate": 9.789069013432144e-07,
      "loss": 0.0003,
      "num_tokens": 62668852.0,
      "reward": 0.14145326614379883,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.14145326614379883,
      "rewards/reward_func/std": 0.0,
      "step": 2278,
      "step_time": 25.76391412690282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 147.0,
      "completions/mean_terminated_length": 147.0,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.3768761530518532,
      "epoch": 0.10555812876331636,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002742957789450884,
      "kl": 0.0020997224492020905,
      "learning_rate": 9.788976377952755e-07,
      "loss": 0.0001,
      "num_tokens": 62689572.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2279,
      "step_time": 15.67442176118493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 197.75,
      "completions/mean_terminated_length": 197.75,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.3708306774497032,
      "epoch": 0.10560444650301065,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017952244961634278,
      "kl": 0.0017793258593883365,
      "learning_rate": 9.788883742473368e-07,
      "loss": 0.0001,
      "num_tokens": 62721984.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2280,
      "step_time": 22.377992317080498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 170.5,
      "completions/mean_terminated_length": 170.5,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.40220367163419724,
      "epoch": 0.10565076424270496,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036408279556781054,
      "kl": 0.00243093614699319,
      "learning_rate": 9.788791106993977e-07,
      "loss": 0.0001,
      "num_tokens": 62745208.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2281,
      "step_time": 19.542316388338804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 198.0625,
      "completions/mean_terminated_length": 198.0625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.37135354429483414,
      "epoch": 0.10569708198239926,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20602072775363922,
      "kl": 0.013688758946955204,
      "learning_rate": 9.788698471514589e-07,
      "loss": -0.076,
      "num_tokens": 62768873.0,
      "reward": 0.46970653533935547,
      "reward_std": 0.4851108193397522,
      "rewards/reward_func/mean": 0.46970653533935547,
      "rewards/reward_func/std": 0.4851108193397522,
      "step": 2282,
      "step_time": 24.103362884372473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 201.875,
      "completions/mean_terminated_length": 201.875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3340918496251106,
      "epoch": 0.10574339972209357,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10164111852645874,
      "kl": 0.005126326461322606,
      "learning_rate": 9.7886058360352e-07,
      "loss": -0.1064,
      "num_tokens": 62792647.0,
      "reward": 0.23367546498775482,
      "reward_std": 0.42207178473472595,
      "rewards/reward_func/mean": 0.23367546498775482,
      "rewards/reward_func/std": 0.42207178473472595,
      "step": 2283,
      "step_time": 23.066465586423874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 176.3125,
      "completions/mean_terminated_length": 176.3125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.35164161026477814,
      "epoch": 0.10578971746178786,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016053604194894433,
      "kl": 0.0018234541930723935,
      "learning_rate": 9.788513200555813e-07,
      "loss": 0.0001,
      "num_tokens": 62814044.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2284,
      "step_time": 19.28101134300232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 153.8125,
      "completions/mean_terminated_length": 153.8125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.3576444014906883,
      "epoch": 0.10583603520148217,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002952948911115527,
      "kl": 0.0024166183138731867,
      "learning_rate": 9.788420565076425e-07,
      "loss": 0.0001,
      "num_tokens": 62835513.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2285,
      "step_time": 16.41217329725623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 155.4375,
      "completions/mean_terminated_length": 155.4375,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.18707425519824028,
      "epoch": 0.10588235294117647,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003922322764992714,
      "kl": 0.002202034753281623,
      "learning_rate": 9.788327929597036e-07,
      "loss": 0.0001,
      "num_tokens": 62862640.0,
      "reward": 0.5682365894317627,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5682365894317627,
      "rewards/reward_func/std": 0.0,
      "step": 2286,
      "step_time": 17.08743765950203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 175.6875,
      "completions/mean_terminated_length": 175.6875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.2407989427447319,
      "epoch": 0.10592867068087078,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002489193808287382,
      "kl": 0.0017187929479405284,
      "learning_rate": 9.788235294117647e-07,
      "loss": 0.0001,
      "num_tokens": 62887691.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2287,
      "step_time": 18.360634196549654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 414.0,
      "completions/max_terminated_length": 414.0,
      "completions/mean_length": 210.1875,
      "completions/mean_terminated_length": 210.1875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3300726041197777,
      "epoch": 0.10597498842056507,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13574030995368958,
      "kl": 0.00281023868592456,
      "learning_rate": 9.788142658638258e-07,
      "loss": 0.3027,
      "num_tokens": 62913230.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 2288,
      "step_time": 33.76765315979719
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 237.625,
      "completions/mean_terminated_length": 237.625,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "entropy": 0.19006231427192688,
      "epoch": 0.10602130616025938,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003184650093317032,
      "kl": 0.002425000420771539,
      "learning_rate": 9.78805002315887e-07,
      "loss": 0.0001,
      "num_tokens": 62951848.0,
      "reward": 0.7742860317230225,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7742860317230225,
      "rewards/reward_func/std": 0.0,
      "step": 2289,
      "step_time": 26.346284467726946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 156.4375,
      "completions/mean_terminated_length": 156.4375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.4017608165740967,
      "epoch": 0.10606762389995368,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015749339945614338,
      "kl": 0.0016024188371375203,
      "learning_rate": 9.78795738767948e-07,
      "loss": 0.0001,
      "num_tokens": 62988831.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2290,
      "step_time": 20.73379624262452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 134.125,
      "completions/mean_terminated_length": 134.125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.3084176033735275,
      "epoch": 0.10611394163964799,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016018265159800649,
      "kl": 0.0014150924980640411,
      "learning_rate": 9.787864752200092e-07,
      "loss": 0.0001,
      "num_tokens": 63019921.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2291,
      "step_time": 16.553121391683817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 136.9375,
      "completions/mean_terminated_length": 136.9375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2522764801979065,
      "epoch": 0.10616025937934229,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0061822873540222645,
      "kl": 0.0022942226787563413,
      "learning_rate": 9.787772116720703e-07,
      "loss": 0.0001,
      "num_tokens": 63039648.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2292,
      "step_time": 14.715745452791452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 146.875,
      "completions/mean_terminated_length": 146.875,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.37816808372735977,
      "epoch": 0.1062065771190366,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015219327760860324,
      "kl": 0.0018334352935198694,
      "learning_rate": 9.787679481241315e-07,
      "loss": 0.0001,
      "num_tokens": 63078798.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2293,
      "step_time": 20.435246918350458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 135.1875,
      "completions/mean_terminated_length": 135.1875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3749251514673233,
      "epoch": 0.10625289485873089,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003979141358286142,
      "kl": 0.0027734158793464303,
      "learning_rate": 9.787586845761926e-07,
      "loss": 0.0001,
      "num_tokens": 63108817.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2294,
      "step_time": 18.0811504162848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 140.375,
      "completions/mean_terminated_length": 140.375,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.36283373087644577,
      "epoch": 0.1062992125984252,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013740368885919452,
      "kl": 0.001440608175471425,
      "learning_rate": 9.787494210282537e-07,
      "loss": 0.0001,
      "num_tokens": 63132711.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2295,
      "step_time": 16.465324983000755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 155.3125,
      "completions/mean_terminated_length": 155.3125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.20616553351283073,
      "epoch": 0.1063455303381195,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.081479012966156,
      "kl": 0.0019770217477343976,
      "learning_rate": 9.787401574803148e-07,
      "loss": 0.0286,
      "num_tokens": 63157404.0,
      "reward": 0.856032133102417,
      "reward_std": 0.22827444970607758,
      "rewards/reward_func/mean": 0.856032133102417,
      "rewards/reward_func/std": 0.22827443480491638,
      "step": 2296,
      "step_time": 17.017949648201466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 102.0,
      "completions/mean_terminated_length": 102.0,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "entropy": 0.29436952620744705,
      "epoch": 0.10639184807781381,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002989846048876643,
      "kl": 0.0019666525186039507,
      "learning_rate": 9.787308939323762e-07,
      "loss": 0.0001,
      "num_tokens": 63180956.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2297,
      "step_time": 15.713763888925314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 191.375,
      "completions/mean_terminated_length": 191.375,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.19966155290603638,
      "epoch": 0.1064381658175081,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08586151897907257,
      "kl": 0.0008249464735854417,
      "learning_rate": 9.787216303844373e-07,
      "loss": 0.0048,
      "num_tokens": 63214898.0,
      "reward": 0.9880691766738892,
      "reward_std": 0.04772331565618515,
      "rewards/reward_func/mean": 0.9880691766738892,
      "rewards/reward_func/std": 0.04772332310676575,
      "step": 2298,
      "step_time": 22.55654987320304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 352.0,
      "completions/max_terminated_length": 352.0,
      "completions/mean_length": 222.0625,
      "completions/mean_terminated_length": 222.0625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.46618911623954773,
      "epoch": 0.10648448355720241,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006537908222526312,
      "kl": 0.0052906685741618276,
      "learning_rate": 9.787123668364984e-07,
      "loss": 0.0003,
      "num_tokens": 63239875.0,
      "reward": 3.869818243629197e-09,
      "reward_std": 8.242940552349864e-09,
      "rewards/reward_func/mean": 3.869818243629197e-09,
      "rewards/reward_func/std": 8.242940552349864e-09,
      "step": 2299,
      "step_time": 29.141530752182007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 208.8125,
      "completions/mean_terminated_length": 208.8125,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.2937890291213989,
      "epoch": 0.10653080129689671,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016014168504625559,
      "kl": 0.0014777045871596783,
      "learning_rate": 9.787031032885593e-07,
      "loss": 0.0001,
      "num_tokens": 63263088.0,
      "reward": 0.6897482872009277,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6897482872009277,
      "rewards/reward_func/std": 0.0,
      "step": 2300,
      "step_time": 20.025199435651302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 151.625,
      "completions/mean_terminated_length": 151.625,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.37442679703235626,
      "epoch": 0.10657711903659102,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004565059673041105,
      "kl": 0.003417026367969811,
      "learning_rate": 9.786938397406207e-07,
      "loss": 0.0002,
      "num_tokens": 63284090.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2301,
      "step_time": 16.525586277246475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 201.0625,
      "completions/mean_terminated_length": 201.0625,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.1893659010529518,
      "epoch": 0.10662343677628532,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015991524560377002,
      "kl": 0.0013263034634292126,
      "learning_rate": 9.786845761926818e-07,
      "loss": 0.0001,
      "num_tokens": 63308315.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 2302,
      "step_time": 19.861064448952675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 116.0625,
      "completions/mean_terminated_length": 116.0625,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2300916463136673,
      "epoch": 0.10666975451597963,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018000829732045531,
      "kl": 0.0013958195049781352,
      "learning_rate": 9.78675312644743e-07,
      "loss": 0.0001,
      "num_tokens": 63328508.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2303,
      "step_time": 13.659082971513271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 149.0625,
      "completions/mean_terminated_length": 149.0625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.2581302411854267,
      "epoch": 0.10671607225567392,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002336992183700204,
      "kl": 0.0014520465047098696,
      "learning_rate": 9.78666049096804e-07,
      "loss": 0.0001,
      "num_tokens": 63348797.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2304,
      "step_time": 15.82089039310813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 167.1875,
      "completions/mean_terminated_length": 167.1875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4135226383805275,
      "epoch": 0.10676238999536823,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015398422256112099,
      "kl": 0.0017486756551079452,
      "learning_rate": 9.786567855488652e-07,
      "loss": 0.0001,
      "num_tokens": 63384304.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2305,
      "step_time": 21.010801576077938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 123.4375,
      "completions/mean_terminated_length": 123.4375,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "entropy": 0.27658435702323914,
      "epoch": 0.10680870773506253,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003001737641170621,
      "kl": 0.0021387546439655125,
      "learning_rate": 9.786475220009263e-07,
      "loss": 0.0001,
      "num_tokens": 63404183.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2306,
      "step_time": 14.322453249245882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 153.5,
      "completions/mean_terminated_length": 153.5,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3153928518295288,
      "epoch": 0.10685502547475684,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008436080999672413,
      "kl": 0.004455897840671241,
      "learning_rate": 9.786382584529874e-07,
      "loss": 0.0002,
      "num_tokens": 63431343.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2307,
      "step_time": 18.271749652922153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 161.4375,
      "completions/mean_terminated_length": 161.4375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3811292424798012,
      "epoch": 0.10690134321445113,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030828353483229876,
      "kl": 0.002185209741583094,
      "learning_rate": 9.786289949050485e-07,
      "loss": 0.0001,
      "num_tokens": 63455734.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2308,
      "step_time": 18.74537069350481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 121.5625,
      "completions/mean_terminated_length": 121.5625,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2895944267511368,
      "epoch": 0.10694766095414544,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013630129396915436,
      "kl": 0.001331931067397818,
      "learning_rate": 9.786197313571097e-07,
      "loss": 0.0001,
      "num_tokens": 63477439.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2309,
      "step_time": 14.01130260899663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 179.25,
      "completions/mean_terminated_length": 179.25,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.36147908866405487,
      "epoch": 0.10699397869383974,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014920932007953525,
      "kl": 0.0014613425882998854,
      "learning_rate": 9.78610467809171e-07,
      "loss": 0.0001,
      "num_tokens": 63511107.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2310,
      "step_time": 22.073243718594313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 189.4375,
      "completions/mean_terminated_length": 189.4375,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.26418977975845337,
      "epoch": 0.10704029643353405,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1080576553940773,
      "kl": 0.0015041398874018341,
      "learning_rate": 9.786012042612321e-07,
      "loss": -0.0444,
      "num_tokens": 63542954.0,
      "reward": 0.8950977325439453,
      "reward_std": 0.04094961658120155,
      "rewards/reward_func/mean": 0.8950977325439453,
      "rewards/reward_func/std": 0.040949635207653046,
      "step": 2311,
      "step_time": 22.31054286286235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 164.5,
      "completions/mean_terminated_length": 164.5,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.23768489435315132,
      "epoch": 0.10708661417322834,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09017661213874817,
      "kl": 0.0025725949089974165,
      "learning_rate": 9.78591940713293e-07,
      "loss": -0.0531,
      "num_tokens": 63572898.0,
      "reward": 0.9160261154174805,
      "reward_std": 0.027147367596626282,
      "rewards/reward_func/mean": 0.9160261154174805,
      "rewards/reward_func/std": 0.027147362008690834,
      "step": 2312,
      "step_time": 20.856951646506786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 181.8125,
      "completions/mean_terminated_length": 181.8125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.165035892277956,
      "epoch": 0.10713293191292265,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10406757891178131,
      "kl": 0.0022495368611998856,
      "learning_rate": 9.785826771653542e-07,
      "loss": -0.1167,
      "num_tokens": 63598703.0,
      "reward": 0.5950411558151245,
      "reward_std": 0.4749177396297455,
      "rewards/reward_func/mean": 0.5950411558151245,
      "rewards/reward_func/std": 0.47491776943206787,
      "step": 2313,
      "step_time": 19.663224138319492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 135.0625,
      "completions/mean_terminated_length": 135.0625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.2723177596926689,
      "epoch": 0.10717924965261695,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016098006162792444,
      "kl": 0.0013433267595246434,
      "learning_rate": 9.785734136174155e-07,
      "loss": 0.0001,
      "num_tokens": 63619888.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2314,
      "step_time": 14.279137838631868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 196.3125,
      "completions/mean_terminated_length": 196.3125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.39508721977472305,
      "epoch": 0.10722556739231126,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002097412943840027,
      "kl": 0.0024874747614376247,
      "learning_rate": 9.785641500694766e-07,
      "loss": 0.0001,
      "num_tokens": 63655765.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2315,
      "step_time": 23.132310081273317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 223.875,
      "completions/mean_terminated_length": 223.875,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "entropy": 0.19757945090532303,
      "epoch": 0.10727188513200556,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11243237555027008,
      "kl": 0.0047940564109012485,
      "learning_rate": 9.785548865215377e-07,
      "loss": -0.0059,
      "num_tokens": 63694163.0,
      "reward": 0.35405808687210083,
      "reward_std": 0.014812404289841652,
      "rewards/reward_func/mean": 0.35405808687210083,
      "rewards/reward_func/std": 0.014812405221164227,
      "step": 2316,
      "step_time": 25.012777283787727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 151.5625,
      "completions/mean_terminated_length": 151.5625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3269631415605545,
      "epoch": 0.10731820287169987,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002156304894015193,
      "kl": 0.0016005643119569868,
      "learning_rate": 9.785456229735989e-07,
      "loss": 0.0001,
      "num_tokens": 63716812.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2317,
      "step_time": 17.062161348760128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 192.6875,
      "completions/mean_terminated_length": 192.6875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.2712355703115463,
      "epoch": 0.10736452061139416,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011392252519726753,
      "kl": 0.00468895654194057,
      "learning_rate": 9.7853635942566e-07,
      "loss": 0.0002,
      "num_tokens": 63740199.0,
      "reward": 0.3381999135017395,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3381999135017395,
      "rewards/reward_func/std": 0.0,
      "step": 2318,
      "step_time": 20.61536430567503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 166.75,
      "completions/mean_terminated_length": 166.75,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.1931329183280468,
      "epoch": 0.10741083835108847,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016985037364065647,
      "kl": 0.0012550139799714088,
      "learning_rate": 9.785270958777211e-07,
      "loss": 0.0001,
      "num_tokens": 63762211.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 2319,
      "step_time": 18.47048844769597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 129.375,
      "completions/mean_terminated_length": 129.375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3037542253732681,
      "epoch": 0.10745715609078277,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018543946789577603,
      "kl": 0.001508870889665559,
      "learning_rate": 9.785178323297822e-07,
      "loss": 0.0001,
      "num_tokens": 63788681.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2320,
      "step_time": 16.705736588686705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 155.5,
      "completions/mean_terminated_length": 155.5,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.40432457625865936,
      "epoch": 0.10750347383047708,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019889730028808117,
      "kl": 0.002315671998076141,
      "learning_rate": 9.785085687818434e-07,
      "loss": 0.0001,
      "num_tokens": 63845745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2321,
      "step_time": 25.2012220621109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 202.3125,
      "completions/mean_terminated_length": 202.3125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.3099028021097183,
      "epoch": 0.10754979157017137,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11800023168325424,
      "kl": 0.006881332607008517,
      "learning_rate": 9.784993052339045e-07,
      "loss": -0.1077,
      "num_tokens": 63869734.0,
      "reward": 0.5659927129745483,
      "reward_std": 0.4527941644191742,
      "rewards/reward_func/mean": 0.5659927129745483,
      "rewards/reward_func/std": 0.4527941942214966,
      "step": 2322,
      "step_time": 21.946988113224506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 167.375,
      "completions/mean_terminated_length": 167.375,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.26356247067451477,
      "epoch": 0.10759610930986568,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12838982045650482,
      "kl": 0.0028781691507901996,
      "learning_rate": 9.784900416859656e-07,
      "loss": 0.021,
      "num_tokens": 63894332.0,
      "reward": 0.33142462372779846,
      "reward_std": 0.01225903071463108,
      "rewards/reward_func/mean": 0.33142462372779846,
      "rewards/reward_func/std": 0.012259027920663357,
      "step": 2323,
      "step_time": 17.727836210280657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 152.625,
      "completions/mean_terminated_length": 152.625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.20181596651673317,
      "epoch": 0.10764242704955998,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008901465684175491,
      "kl": 0.0038753908302169293,
      "learning_rate": 9.78480778138027e-07,
      "loss": 0.0002,
      "num_tokens": 63916886.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2324,
      "step_time": 15.363601807504892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 142.6875,
      "completions/mean_terminated_length": 142.6875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.33088518679142,
      "epoch": 0.10768874478925429,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030767915304750204,
      "kl": 0.002382044738624245,
      "learning_rate": 9.784715145900879e-07,
      "loss": 0.0001,
      "num_tokens": 63938065.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2325,
      "step_time": 15.904967341572046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 140.8125,
      "completions/mean_terminated_length": 140.8125,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.15863429754972458,
      "epoch": 0.10773506252894859,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15959049761295319,
      "kl": 0.0014423929387703538,
      "learning_rate": 9.78462251042149e-07,
      "loss": 0.0282,
      "num_tokens": 63961582.0,
      "reward": 0.8296091556549072,
      "reward_std": 0.11370529979467392,
      "rewards/reward_func/mean": 0.8296091556549072,
      "rewards/reward_func/std": 0.11370529979467392,
      "step": 2326,
      "step_time": 15.291508000344038
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 144.4375,
      "completions/mean_terminated_length": 144.4375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.32636523991823196,
      "epoch": 0.1077813802686429,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028073210269212723,
      "kl": 0.0021235549356788397,
      "learning_rate": 9.784529874942103e-07,
      "loss": 0.0001,
      "num_tokens": 63981909.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2327,
      "step_time": 17.106442864984274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 199.0625,
      "completions/mean_terminated_length": 199.0625,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.31194355338811874,
      "epoch": 0.10782769800833719,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028667158912867308,
      "kl": 0.0023261773167178035,
      "learning_rate": 9.784437239462715e-07,
      "loss": 0.0001,
      "num_tokens": 64008758.0,
      "reward": 0.27390056848526,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.27390056848526,
      "rewards/reward_func/std": 0.0,
      "step": 2328,
      "step_time": 21.130776807665825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 323.0,
      "completions/max_terminated_length": 323.0,
      "completions/mean_length": 202.0625,
      "completions/mean_terminated_length": 202.0625,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.3795570209622383,
      "epoch": 0.1078740157480315,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0055798115208745,
      "kl": 0.0038282829336822033,
      "learning_rate": 9.784344603983326e-07,
      "loss": 0.0002,
      "num_tokens": 64035367.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2329,
      "step_time": 27.652083162218332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 208.875,
      "completions/mean_terminated_length": 208.875,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.22390004619956017,
      "epoch": 0.1079203334877258,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024152437690645456,
      "kl": 0.0018542756733950227,
      "learning_rate": 9.784251968503937e-07,
      "loss": 0.0001,
      "num_tokens": 64062421.0,
      "reward": 0.786984384059906,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.786984384059906,
      "rewards/reward_func/std": 0.0,
      "step": 2330,
      "step_time": 22.75635538250208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 144.5625,
      "completions/mean_terminated_length": 144.5625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.2623891681432724,
      "epoch": 0.10796665122742011,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001554641523398459,
      "kl": 0.0014957803941797465,
      "learning_rate": 9.784159333024548e-07,
      "loss": 0.0001,
      "num_tokens": 64082382.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2331,
      "step_time": 14.557039085775614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 137.6875,
      "completions/mean_terminated_length": 137.6875,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.3514051139354706,
      "epoch": 0.1080129689671144,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029933264013379812,
      "kl": 0.0021869066986255348,
      "learning_rate": 9.78406669754516e-07,
      "loss": 0.0001,
      "num_tokens": 64108569.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2332,
      "step_time": 15.876485411077738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 169.75,
      "completions/mean_terminated_length": 169.75,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.4565359354019165,
      "epoch": 0.10805928670680871,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034759847912937403,
      "kl": 0.0026159347617067397,
      "learning_rate": 9.78397406206577e-07,
      "loss": 0.0001,
      "num_tokens": 64130037.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2333,
      "step_time": 17.716689959168434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 202.1875,
      "completions/mean_terminated_length": 202.1875,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.23511254787445068,
      "epoch": 0.10810560444650301,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034925348591059446,
      "kl": 0.002422249934170395,
      "learning_rate": 9.783881426586382e-07,
      "loss": 0.0001,
      "num_tokens": 64155784.0,
      "reward": 0.06856315582990646,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.06856315582990646,
      "rewards/reward_func/std": 0.0,
      "step": 2334,
      "step_time": 20.25643503293395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 116.0,
      "completions/mean_terminated_length": 116.0,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2809803709387779,
      "epoch": 0.10815192218619732,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016170106828212738,
      "kl": 0.00152270492981188,
      "learning_rate": 9.783788791106993e-07,
      "loss": 0.0001,
      "num_tokens": 64176968.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2335,
      "step_time": 13.492836754769087
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 135.6875,
      "completions/mean_terminated_length": 135.6875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.28990789502859116,
      "epoch": 0.10819823992589161,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017747610108926892,
      "kl": 0.0013901473430451006,
      "learning_rate": 9.783696155627605e-07,
      "loss": 0.0001,
      "num_tokens": 64197459.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2336,
      "step_time": 15.477379083633423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 129.9375,
      "completions/mean_terminated_length": 129.9375,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2344549000263214,
      "epoch": 0.10824455766558592,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0044554611667990685,
      "kl": 0.0020847307168878615,
      "learning_rate": 9.783603520148216e-07,
      "loss": 0.0001,
      "num_tokens": 64217042.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2337,
      "step_time": 15.261674121022224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 126.9375,
      "completions/mean_terminated_length": 126.9375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.26227881759405136,
      "epoch": 0.10829087540528022,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007790118455886841,
      "kl": 0.0023844182142056525,
      "learning_rate": 9.783510884668827e-07,
      "loss": 0.0001,
      "num_tokens": 64238065.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2338,
      "step_time": 14.55690012872219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 143.25,
      "completions/mean_terminated_length": 143.25,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.23508936539292336,
      "epoch": 0.10833719314497453,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15546344220638275,
      "kl": 0.007544756750576198,
      "learning_rate": 9.783418249189438e-07,
      "loss": 0.0474,
      "num_tokens": 64260229.0,
      "reward": 0.4911891222000122,
      "reward_std": 0.14480088651180267,
      "rewards/reward_func/mean": 0.4911891222000122,
      "rewards/reward_func/std": 0.14480090141296387,
      "step": 2339,
      "step_time": 16.440475221723318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 143.6875,
      "completions/mean_terminated_length": 143.6875,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.2491108849644661,
      "epoch": 0.10838351088466883,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017591416835784912,
      "kl": 0.0014814950700383633,
      "learning_rate": 9.78332561371005e-07,
      "loss": 0.0001,
      "num_tokens": 64280192.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2340,
      "step_time": 14.446493964642286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 252.25,
      "completions/mean_terminated_length": 252.25,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "entropy": 0.18401716277003288,
      "epoch": 0.10842982862436314,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024554389528930187,
      "kl": 0.001874446781584993,
      "learning_rate": 9.783232978230663e-07,
      "loss": 0.0001,
      "num_tokens": 64309972.0,
      "reward": 0.9813089370727539,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9813089370727539,
      "rewards/reward_func/std": 0.0,
      "step": 2341,
      "step_time": 24.967931482940912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 157.5,
      "completions/mean_terminated_length": 157.5,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.41685056686401367,
      "epoch": 0.10847614636405743,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021794848144054413,
      "kl": 0.002308505936525762,
      "learning_rate": 9.783140342751274e-07,
      "loss": 0.0001,
      "num_tokens": 64362252.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2342,
      "step_time": 24.696660231798887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 126.875,
      "completions/mean_terminated_length": 126.875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2645728290081024,
      "epoch": 0.10852246410375174,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020788901019841433,
      "kl": 0.0015851175121497363,
      "learning_rate": 9.783047707271883e-07,
      "loss": 0.0001,
      "num_tokens": 64382746.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2343,
      "step_time": 14.665306013077497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 167.125,
      "completions/mean_terminated_length": 167.125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3263947293162346,
      "epoch": 0.10856878184344604,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10520664602518082,
      "kl": 0.004593534977175295,
      "learning_rate": 9.782955071792497e-07,
      "loss": -0.002,
      "num_tokens": 64403372.0,
      "reward": 0.7351804375648499,
      "reward_std": 0.3647516369819641,
      "rewards/reward_func/mean": 0.7351804375648499,
      "rewards/reward_func/std": 0.3647516369819641,
      "step": 2344,
      "step_time": 19.894864667207003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 337.0,
      "completions/max_terminated_length": 337.0,
      "completions/mean_length": 252.0625,
      "completions/mean_terminated_length": 252.0625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.34028272330760956,
      "epoch": 0.10861509958314035,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10743195563554764,
      "kl": 0.005560745019465685,
      "learning_rate": 9.782862436313108e-07,
      "loss": -0.1802,
      "num_tokens": 64430701.0,
      "reward": 0.6157898306846619,
      "reward_std": 0.49269482493400574,
      "rewards/reward_func/mean": 0.6157898306846619,
      "rewards/reward_func/std": 0.4926948547363281,
      "step": 2345,
      "step_time": 28.747718140482903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 160.9375,
      "completions/mean_terminated_length": 160.9375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.1433638110756874,
      "epoch": 0.10866141732283464,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004954099655151367,
      "kl": 0.003796221222728491,
      "learning_rate": 9.78276980083372e-07,
      "loss": 0.0002,
      "num_tokens": 64455820.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2346,
      "step_time": 16.379356395453215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 183.25,
      "completions/mean_terminated_length": 183.25,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.38788267970085144,
      "epoch": 0.10870773506252895,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036784771364182234,
      "kl": 0.0028724872681777924,
      "learning_rate": 9.78267716535433e-07,
      "loss": 0.0001,
      "num_tokens": 64478704.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2347,
      "step_time": 19.262580774724483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 328.0,
      "completions/max_terminated_length": 328.0,
      "completions/mean_length": 233.125,
      "completions/mean_terminated_length": 233.125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.29155682027339935,
      "epoch": 0.10875405280222325,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09992647916078568,
      "kl": 0.004710135166533291,
      "learning_rate": 9.782584529874942e-07,
      "loss": -0.1581,
      "num_tokens": 64503282.0,
      "reward": 0.568049430847168,
      "reward_std": 0.2834235429763794,
      "rewards/reward_func/mean": 0.568049430847168,
      "rewards/reward_func/std": 0.2834235429763794,
      "step": 2348,
      "step_time": 27.2289629727602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 192.25,
      "completions/mean_terminated_length": 192.25,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.27687399834394455,
      "epoch": 0.10880037054191756,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10451581329107285,
      "kl": 0.004878342500887811,
      "learning_rate": 9.782491894395553e-07,
      "loss": -0.0744,
      "num_tokens": 64540934.0,
      "reward": 0.5664272904396057,
      "reward_std": 0.16924944519996643,
      "rewards/reward_func/mean": 0.5664272904396057,
      "rewards/reward_func/std": 0.16924946010112762,
      "step": 2349,
      "step_time": 24.839217126369476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 172.4375,
      "completions/mean_terminated_length": 172.4375,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.20761826634407043,
      "epoch": 0.10884668828161186,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11507759988307953,
      "kl": 0.0012529796367743984,
      "learning_rate": 9.782399258916164e-07,
      "loss": -0.0554,
      "num_tokens": 64566013.0,
      "reward": 0.38113081455230713,
      "reward_std": 0.409390389919281,
      "rewards/reward_func/mean": 0.38113081455230713,
      "rewards/reward_func/std": 0.4093904197216034,
      "step": 2350,
      "step_time": 19.725036844611168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 426.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 333.375,
      "completions/mean_terminated_length": 333.375,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "entropy": 0.23016024380922318,
      "epoch": 0.10889300602130617,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06802016496658325,
      "kl": 0.0021586233924608678,
      "learning_rate": 9.782306623436775e-07,
      "loss": -0.0762,
      "num_tokens": 64607235.0,
      "reward": 0.5437136888504028,
      "reward_std": 0.24644720554351807,
      "rewards/reward_func/mean": 0.5437136888504028,
      "rewards/reward_func/std": 0.24644720554351807,
      "step": 2351,
      "step_time": 37.613715037703514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 275.0,
      "completions/max_terminated_length": 275.0,
      "completions/mean_length": 174.4375,
      "completions/mean_terminated_length": 174.4375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.23183806240558624,
      "epoch": 0.10893932376100046,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.020201504230499268,
      "kl": 0.004952602321282029,
      "learning_rate": 9.782213987957387e-07,
      "loss": 0.0003,
      "num_tokens": 64655786.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 2352,
      "step_time": 29.950926713645458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 135.1875,
      "completions/mean_terminated_length": 135.1875,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.27789459377527237,
      "epoch": 0.10898564150069477,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016213261988013983,
      "kl": 0.0014960351691115648,
      "learning_rate": 9.782121352477998e-07,
      "loss": 0.0001,
      "num_tokens": 64683885.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2353,
      "step_time": 15.662431389093399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 140.6875,
      "completions/mean_terminated_length": 140.6875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.2667175978422165,
      "epoch": 0.10903195924038907,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033754301257431507,
      "kl": 0.0020495178760029376,
      "learning_rate": 9.782028716998611e-07,
      "loss": 0.0001,
      "num_tokens": 64704200.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2354,
      "step_time": 15.436271790415049
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 326.0,
      "completions/max_terminated_length": 326.0,
      "completions/mean_length": 225.4375,
      "completions/mean_terminated_length": 225.4375,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.4056299179792404,
      "epoch": 0.10907827698008338,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11347091197967529,
      "kl": 0.004436219925992191,
      "learning_rate": 9.78193608151922e-07,
      "loss": -0.1563,
      "num_tokens": 64730495.0,
      "reward": 0.11520224809646606,
      "reward_std": 0.2597229778766632,
      "rewards/reward_func/mean": 0.11520224809646606,
      "rewards/reward_func/std": 0.2597229778766632,
      "step": 2355,
      "step_time": 27.59665833041072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 126.9375,
      "completions/mean_terminated_length": 126.9375,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.21263223141431808,
      "epoch": 0.10912459471977767,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013671000488102436,
      "kl": 0.0021115583658684045,
      "learning_rate": 9.781843446039832e-07,
      "loss": 0.0001,
      "num_tokens": 64751278.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2356,
      "step_time": 14.197697196155787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 182.125,
      "completions/mean_terminated_length": 182.125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3813173770904541,
      "epoch": 0.10917091245947198,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005697739310562611,
      "kl": 0.0038346676155924797,
      "learning_rate": 9.781750810560445e-07,
      "loss": 0.0002,
      "num_tokens": 64774592.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2357,
      "step_time": 19.704799972474575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 231.9375,
      "completions/mean_terminated_length": 231.9375,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.3623291403055191,
      "epoch": 0.10921723019916628,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07882294058799744,
      "kl": 0.004046881454996765,
      "learning_rate": 9.781658175081056e-07,
      "loss": -0.0466,
      "num_tokens": 64797807.0,
      "reward": 0.6959699988365173,
      "reward_std": 0.4154508113861084,
      "rewards/reward_func/mean": 0.6959699988365173,
      "rewards/reward_func/std": 0.4154508411884308,
      "step": 2358,
      "step_time": 23.98015521466732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 148.1875,
      "completions/mean_terminated_length": 148.1875,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.27642325311899185,
      "epoch": 0.10926354793886059,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002546600066125393,
      "kl": 0.0018306243291590363,
      "learning_rate": 9.781565539601668e-07,
      "loss": 0.0001,
      "num_tokens": 64833090.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2359,
      "step_time": 19.557082820683718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 145.625,
      "completions/mean_terminated_length": 145.625,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.24291365966200829,
      "epoch": 0.10930986567855489,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027510575018823147,
      "kl": 0.0019136813352815807,
      "learning_rate": 9.781472904122279e-07,
      "loss": 0.0001,
      "num_tokens": 64852732.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2360,
      "step_time": 15.28828652203083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 331.0,
      "completions/max_terminated_length": 331.0,
      "completions/mean_length": 222.75,
      "completions/mean_terminated_length": 222.75,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.2334187850356102,
      "epoch": 0.1093561834182492,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07622101902961731,
      "kl": 0.002250398480100557,
      "learning_rate": 9.78138026864289e-07,
      "loss": -0.1944,
      "num_tokens": 64878696.0,
      "reward": 0.13393382728099823,
      "reward_std": 0.27229103446006775,
      "rewards/reward_func/mean": 0.13393382728099823,
      "rewards/reward_func/std": 0.27229103446006775,
      "step": 2361,
      "step_time": 27.91884146258235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 279.0,
      "completions/max_terminated_length": 279.0,
      "completions/mean_length": 210.4375,
      "completions/mean_terminated_length": 210.4375,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.3545868992805481,
      "epoch": 0.10940250115794349,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12284550070762634,
      "kl": 0.0036865408183075488,
      "learning_rate": 9.781287633163501e-07,
      "loss": -0.1176,
      "num_tokens": 64909599.0,
      "reward": 0.46391788125038147,
      "reward_std": 0.39168205857276917,
      "rewards/reward_func/mean": 0.46391788125038147,
      "rewards/reward_func/std": 0.39168205857276917,
      "step": 2362,
      "step_time": 25.77095314115286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 211.75,
      "completions/mean_terminated_length": 211.75,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.26528290659189224,
      "epoch": 0.1094488188976378,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00298228464089334,
      "kl": 0.0026924379053525627,
      "learning_rate": 9.781194997684113e-07,
      "loss": 0.0001,
      "num_tokens": 64931131.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2363,
      "step_time": 21.232171565294266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 193.8125,
      "completions/mean_terminated_length": 193.8125,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.3926021009683609,
      "epoch": 0.1094951366373321,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027155806310474873,
      "kl": 0.001936954795382917,
      "learning_rate": 9.781102362204724e-07,
      "loss": 0.0001,
      "num_tokens": 64957560.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2364,
      "step_time": 20.97134505584836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 189.0625,
      "completions/mean_terminated_length": 189.0625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.2027004174888134,
      "epoch": 0.1095414543770264,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018587826052680612,
      "kl": 0.0013235746591817588,
      "learning_rate": 9.781009726725335e-07,
      "loss": 0.0001,
      "num_tokens": 64988777.0,
      "reward": 0.3487522304058075,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3487522304058075,
      "rewards/reward_func/std": 0.0,
      "step": 2365,
      "step_time": 22.576804656535387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 191.6875,
      "completions/mean_terminated_length": 191.6875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3604848012328148,
      "epoch": 0.1095877721167207,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009871640242636204,
      "kl": 0.004019507789053023,
      "learning_rate": 9.780917091245946e-07,
      "loss": 0.0002,
      "num_tokens": 65017908.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2366,
      "step_time": 21.157035641372204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 196.5,
      "completions/mean_terminated_length": 196.5,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3952259421348572,
      "epoch": 0.10963408985641501,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09334277361631393,
      "kl": 0.004445242928341031,
      "learning_rate": 9.78082445576656e-07,
      "loss": 0.0072,
      "num_tokens": 65065388.0,
      "reward": 0.3393140435218811,
      "reward_std": 0.4524186849594116,
      "rewards/reward_func/mean": 0.3393140435218811,
      "rewards/reward_func/std": 0.4524187445640564,
      "step": 2367,
      "step_time": 26.198620542883873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 267.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 182.5625,
      "completions/mean_terminated_length": 182.5625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.29002469778060913,
      "epoch": 0.10968040759610931,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10416804254055023,
      "kl": 0.006405917811207473,
      "learning_rate": 9.780731820287169e-07,
      "loss": 0.0914,
      "num_tokens": 65095109.0,
      "reward": 0.7475360631942749,
      "reward_std": 0.3708817660808563,
      "rewards/reward_func/mean": 0.7475360631942749,
      "rewards/reward_func/std": 0.3708817660808563,
      "step": 2368,
      "step_time": 24.547886081039906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 169.3125,
      "completions/mean_terminated_length": 169.3125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.2909696698188782,
      "epoch": 0.10972672533580362,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005206348840147257,
      "kl": 0.0033329512807540596,
      "learning_rate": 9.78063918480778e-07,
      "loss": 0.0002,
      "num_tokens": 65117210.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 2369,
      "step_time": 18.68805754557252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 182.1875,
      "completions/mean_terminated_length": 182.1875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.33839114010334015,
      "epoch": 0.10977304307549791,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10179704427719116,
      "kl": 0.004062555148266256,
      "learning_rate": 9.780546549328391e-07,
      "loss": -0.033,
      "num_tokens": 65140477.0,
      "reward": 0.06933650374412537,
      "reward_std": 0.08119859546422958,
      "rewards/reward_func/mean": 0.06933650374412537,
      "rewards/reward_func/std": 0.08119859546422958,
      "step": 2370,
      "step_time": 23.381348200142384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 128.0,
      "completions/max_terminated_length": 128.0,
      "completions/mean_length": 112.1875,
      "completions/mean_terminated_length": 112.1875,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.2650522142648697,
      "epoch": 0.10981936081519222,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00396697735413909,
      "kl": 0.0021386328153312206,
      "learning_rate": 9.780453913849005e-07,
      "loss": 0.0001,
      "num_tokens": 65160944.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2371,
      "step_time": 12.611635141074657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 180.75,
      "completions/mean_terminated_length": 180.75,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.33697984367609024,
      "epoch": 0.10986567855488652,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00687724445015192,
      "kl": 0.004857703344896436,
      "learning_rate": 9.780361278369616e-07,
      "loss": 0.0002,
      "num_tokens": 65189788.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2372,
      "step_time": 20.00566278770566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 177.9375,
      "completions/mean_terminated_length": 177.9375,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.3338945657014847,
      "epoch": 0.10991199629458083,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11698485165834427,
      "kl": 0.0042671922710724175,
      "learning_rate": 9.780268642890227e-07,
      "loss": -0.1617,
      "num_tokens": 65212843.0,
      "reward": 0.22983068227767944,
      "reward_std": 0.41113361716270447,
      "rewards/reward_func/mean": 0.22983068227767944,
      "rewards/reward_func/std": 0.41113361716270447,
      "step": 2373,
      "step_time": 22.974309355020523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 174.1875,
      "completions/mean_terminated_length": 174.1875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.35960356146097183,
      "epoch": 0.10995831403427513,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00192203838378191,
      "kl": 0.001703375281067565,
      "learning_rate": 9.780176007410838e-07,
      "loss": 0.0001,
      "num_tokens": 65235086.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2374,
      "step_time": 19.51780268922448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 158.4375,
      "completions/mean_terminated_length": 158.4375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.330436110496521,
      "epoch": 0.11000463177396944,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004185125697404146,
      "kl": 0.0025320067070424557,
      "learning_rate": 9.78008337193145e-07,
      "loss": 0.0001,
      "num_tokens": 65259797.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2375,
      "step_time": 16.79103474318981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 180.25,
      "completions/mean_terminated_length": 180.25,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.41182051599025726,
      "epoch": 0.11005094951366373,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00350063294172287,
      "kl": 0.002961619582492858,
      "learning_rate": 9.77999073645206e-07,
      "loss": 0.0001,
      "num_tokens": 65289289.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2376,
      "step_time": 19.341465838253498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 168.125,
      "completions/mean_terminated_length": 168.125,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.20241046324372292,
      "epoch": 0.11009726725335804,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003771762130782008,
      "kl": 0.002782963332720101,
      "learning_rate": 9.779898100972672e-07,
      "loss": 0.0001,
      "num_tokens": 65314155.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2377,
      "step_time": 18.157934233546257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 129.875,
      "completions/mean_terminated_length": 129.875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.24889836087822914,
      "epoch": 0.11014358499305234,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004847789648920298,
      "kl": 0.0024113141116686165,
      "learning_rate": 9.779805465493283e-07,
      "loss": 0.0001,
      "num_tokens": 65333673.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2378,
      "step_time": 15.077660147100687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 137.75,
      "completions/mean_terminated_length": 137.75,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.330291286110878,
      "epoch": 0.11018990273274665,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019224765710532665,
      "kl": 0.00165868503972888,
      "learning_rate": 9.779712830013895e-07,
      "loss": 0.0001,
      "num_tokens": 65362805.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2379,
      "step_time": 16.411409467458725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 150.9375,
      "completions/mean_terminated_length": 150.9375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.24974070861935616,
      "epoch": 0.11023622047244094,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004468156490474939,
      "kl": 0.0033513674279674888,
      "learning_rate": 9.779620194534506e-07,
      "loss": 0.0002,
      "num_tokens": 65399492.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2380,
      "step_time": 20.17722300067544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 295.0,
      "completions/max_terminated_length": 295.0,
      "completions/mean_length": 254.875,
      "completions/mean_terminated_length": 254.875,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "entropy": 0.24785011261701584,
      "epoch": 0.11028253821213525,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0666511207818985,
      "kl": 0.005058192473370582,
      "learning_rate": 9.779527559055117e-07,
      "loss": -0.0315,
      "num_tokens": 65422578.0,
      "reward": 0.9131531715393066,
      "reward_std": 0.22993049025535583,
      "rewards/reward_func/mean": 0.9131531715393066,
      "rewards/reward_func/std": 0.22993049025535583,
      "step": 2381,
      "step_time": 24.465677250176668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 127.375,
      "completions/mean_terminated_length": 127.375,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.3378248065710068,
      "epoch": 0.11032885595182955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004500414710491896,
      "kl": 0.003168799390550703,
      "learning_rate": 9.779434923575728e-07,
      "loss": 0.0002,
      "num_tokens": 65442472.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2382,
      "step_time": 14.456896986812353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 166.875,
      "completions/mean_terminated_length": 166.875,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.16185901314020157,
      "epoch": 0.11037517369152386,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003626069752499461,
      "kl": 0.0024742622044868767,
      "learning_rate": 9.77934228809634e-07,
      "loss": 0.0001,
      "num_tokens": 65465558.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2383,
      "step_time": 17.839644107967615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 168.375,
      "completions/mean_terminated_length": 168.375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.41052452474832535,
      "epoch": 0.11042149143121816,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013905029045417905,
      "kl": 0.0019480255723465234,
      "learning_rate": 9.779249652616953e-07,
      "loss": 0.0001,
      "num_tokens": 65529276.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2384,
      "step_time": 30.762097127735615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 112.0,
      "completions/mean_terminated_length": 112.0,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.2557000517845154,
      "epoch": 0.11046780917091246,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001526663196273148,
      "kl": 0.001381642243359238,
      "learning_rate": 9.779157017137564e-07,
      "loss": 0.0001,
      "num_tokens": 65548572.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2385,
      "step_time": 12.958014130592346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 156.875,
      "completions/mean_terminated_length": 156.875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.37063341587781906,
      "epoch": 0.11051412691060676,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003017621347680688,
      "kl": 0.0026910093147307634,
      "learning_rate": 9.779064381658173e-07,
      "loss": 0.0001,
      "num_tokens": 65573210.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2386,
      "step_time": 17.186774775385857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 142.6875,
      "completions/mean_terminated_length": 142.6875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.21390212327241898,
      "epoch": 0.11056044465030107,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032537176739424467,
      "kl": 0.0016970251745078713,
      "learning_rate": 9.778971746178787e-07,
      "loss": 0.0001,
      "num_tokens": 65592981.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2387,
      "step_time": 15.822840303182602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 144.125,
      "completions/mean_terminated_length": 144.125,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.43736421316862106,
      "epoch": 0.11060676238999537,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025537472683936357,
      "kl": 0.0021149092353880405,
      "learning_rate": 9.778879110699398e-07,
      "loss": 0.0001,
      "num_tokens": 65623735.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2388,
      "step_time": 18.052705015987158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 153.625,
      "completions/mean_terminated_length": 153.625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.43385789543390274,
      "epoch": 0.11065308012968968,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003613361855968833,
      "kl": 0.002479005604982376,
      "learning_rate": 9.77878647522001e-07,
      "loss": 0.0001,
      "num_tokens": 65669201.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2389,
      "step_time": 21.727573167532682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 346.0,
      "completions/max_terminated_length": 346.0,
      "completions/mean_length": 256.9375,
      "completions/mean_terminated_length": 256.9375,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "entropy": 0.25939323380589485,
      "epoch": 0.11069939786938397,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08898063749074936,
      "kl": 0.003215736534912139,
      "learning_rate": 9.77869383974062e-07,
      "loss": -0.0335,
      "num_tokens": 65707808.0,
      "reward": 0.9522287249565125,
      "reward_std": 0.06066947430372238,
      "rewards/reward_func/mean": 0.9522287249565125,
      "rewards/reward_func/std": 0.06066947802901268,
      "step": 2390,
      "step_time": 32.65638582408428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 123.875,
      "completions/mean_terminated_length": 123.875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.29747195541858673,
      "epoch": 0.11074571560907828,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026461619418114424,
      "kl": 0.0016994259494822472,
      "learning_rate": 9.778601204261232e-07,
      "loss": 0.0001,
      "num_tokens": 65728062.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2391,
      "step_time": 14.331314660608768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 173.1875,
      "completions/mean_terminated_length": 173.1875,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.18173721432685852,
      "epoch": 0.11079203334877258,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010064563248306513,
      "kl": 0.0008620497246738523,
      "learning_rate": 9.778508568781843e-07,
      "loss": 0.0,
      "num_tokens": 65762993.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2392,
      "step_time": 20.52535917982459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 150.875,
      "completions/mean_terminated_length": 150.875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.20904475450515747,
      "epoch": 0.11083835108846689,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004252149257808924,
      "kl": 0.003123885951936245,
      "learning_rate": 9.778415933302454e-07,
      "loss": 0.0002,
      "num_tokens": 65785327.0,
      "reward": 0.3219582736492157,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3219582736492157,
      "rewards/reward_func/std": 0.0,
      "step": 2393,
      "step_time": 16.25042412057519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 141.3125,
      "completions/mean_terminated_length": 141.3125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.2977365180850029,
      "epoch": 0.11088466882816118,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024261344224214554,
      "kl": 0.0018466077744960785,
      "learning_rate": 9.778323297823065e-07,
      "loss": 0.0001,
      "num_tokens": 65807556.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2394,
      "step_time": 15.130787659436464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 137.625,
      "completions/mean_terminated_length": 137.625,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.3205549642443657,
      "epoch": 0.1109309865678555,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005907420068979263,
      "kl": 0.0032924521365202963,
      "learning_rate": 9.778230662343677e-07,
      "loss": 0.0002,
      "num_tokens": 65828078.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2395,
      "step_time": 15.151044774800539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 174.8125,
      "completions/mean_terminated_length": 174.8125,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.39984411001205444,
      "epoch": 0.11097730430754979,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008461474440991879,
      "kl": 0.005715583451092243,
      "learning_rate": 9.778138026864288e-07,
      "loss": 0.0003,
      "num_tokens": 65850715.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2396,
      "step_time": 17.606170926243067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 242.3125,
      "completions/mean_terminated_length": 242.3125,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "entropy": 0.24380479007959366,
      "epoch": 0.1110236220472441,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033474103547632694,
      "kl": 0.002408111817203462,
      "learning_rate": 9.778045391384901e-07,
      "loss": 0.0001,
      "num_tokens": 65876736.0,
      "reward": 0.7326324582099915,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7326324582099915,
      "rewards/reward_func/std": 0.0,
      "step": 2397,
      "step_time": 24.573301322758198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 125.0,
      "completions/max_terminated_length": 125.0,
      "completions/mean_length": 110.3125,
      "completions/mean_terminated_length": 110.3125,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.26128362491726875,
      "epoch": 0.1110699397869384,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005125187337398529,
      "kl": 0.0023191372747533023,
      "learning_rate": 9.77795275590551e-07,
      "loss": 0.0001,
      "num_tokens": 65897621.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2398,
      "step_time": 12.474109884351492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 160.8125,
      "completions/mean_terminated_length": 160.8125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.3005419969558716,
      "epoch": 0.1111162575266327,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.007298790384083986,
      "kl": 0.002545641007600352,
      "learning_rate": 9.777860120426122e-07,
      "loss": 0.0029,
      "num_tokens": 65918370.0,
      "reward": 1.9202496332582086e-05,
      "reward_std": 7.4958870754926465e-06,
      "rewards/reward_func/mean": 1.9202496332582086e-05,
      "rewards/reward_func/std": 7.4958875302399974e-06,
      "step": 2399,
      "step_time": 17.20006264746189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 115.75,
      "completions/mean_terminated_length": 115.75,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.2119411677122116,
      "epoch": 0.111162575266327,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017727614613249898,
      "kl": 0.0012794640206266195,
      "learning_rate": 9.777767484946733e-07,
      "loss": 0.0001,
      "num_tokens": 65937806.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2400,
      "step_time": 13.171920608729124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 183.875,
      "completions/mean_terminated_length": 183.875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.24039685726165771,
      "epoch": 0.11120889300602131,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002546001924201846,
      "kl": 0.0018284392426721752,
      "learning_rate": 9.777674849467346e-07,
      "loss": 0.0001,
      "num_tokens": 65965852.0,
      "reward": 0.7788007855415344,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7788007855415344,
      "rewards/reward_func/std": 0.0,
      "step": 2401,
      "step_time": 20.73470561951399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 164.9375,
      "completions/mean_terminated_length": 164.9375,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.2099083736538887,
      "epoch": 0.11125521074571561,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013738662004470825,
      "kl": 0.008505105972290039,
      "learning_rate": 9.777582213987958e-07,
      "loss": 0.0004,
      "num_tokens": 65988043.0,
      "reward": 0.14145326614379883,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.14145326614379883,
      "rewards/reward_func/std": 0.0,
      "step": 2402,
      "step_time": 17.317905079573393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 130.0,
      "completions/mean_terminated_length": 130.0,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.3355402946472168,
      "epoch": 0.11130152848540992,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021074803080409765,
      "kl": 0.001680780842434615,
      "learning_rate": 9.777489578508569e-07,
      "loss": 0.0001,
      "num_tokens": 66024155.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2403,
      "step_time": 18.197333835065365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 117.8125,
      "completions/mean_terminated_length": 117.8125,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2485005334019661,
      "epoch": 0.11134784622510421,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003974970430135727,
      "kl": 0.0022931023268029094,
      "learning_rate": 9.77739694302918e-07,
      "loss": 0.0001,
      "num_tokens": 66043560.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2404,
      "step_time": 12.82517571374774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 278.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 208.875,
      "completions/mean_terminated_length": 208.875,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.38329140841960907,
      "epoch": 0.11139416396479852,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10050483047962189,
      "kl": 0.004255613719578832,
      "learning_rate": 9.777304307549791e-07,
      "loss": 0.0685,
      "num_tokens": 66075606.0,
      "reward": 0.3422679305076599,
      "reward_std": 0.456741601228714,
      "rewards/reward_func/mean": 0.3422679305076599,
      "rewards/reward_func/std": 0.4567416310310364,
      "step": 2405,
      "step_time": 25.608222983777523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 195.25,
      "completions/mean_terminated_length": 195.25,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.1917690858244896,
      "epoch": 0.11144048170449282,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002935251221060753,
      "kl": 0.0018773071642499417,
      "learning_rate": 9.777211672070403e-07,
      "loss": 0.0001,
      "num_tokens": 66113274.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2406,
      "step_time": 23.862790696322918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 173.0,
      "completions/mean_terminated_length": 173.0,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.23762915655970573,
      "epoch": 0.11148679944418713,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015553135890513659,
      "kl": 0.0013095731555949897,
      "learning_rate": 9.777119036591014e-07,
      "loss": 0.0001,
      "num_tokens": 66134202.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2407,
      "step_time": 17.738466504961252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 207.375,
      "completions/mean_terminated_length": 207.375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.3728785365819931,
      "epoch": 0.11153311718388143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06992531567811966,
      "kl": 0.0023069052840583026,
      "learning_rate": 9.777026401111625e-07,
      "loss": -0.0483,
      "num_tokens": 66162672.0,
      "reward": 0.9311447143554688,
      "reward_std": 0.017449283972382545,
      "rewards/reward_func/mean": 0.9311447143554688,
      "rewards/reward_func/std": 0.0174492746591568,
      "step": 2408,
      "step_time": 21.930220041424036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 164.625,
      "completions/mean_terminated_length": 164.625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.4227030873298645,
      "epoch": 0.11157943492357573,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027652329299598932,
      "kl": 0.0028306868043728173,
      "learning_rate": 9.776933765632236e-07,
      "loss": 0.0001,
      "num_tokens": 66193946.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2409,
      "step_time": 19.52329556643963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 185.9375,
      "completions/mean_terminated_length": 185.9375,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.39986591041088104,
      "epoch": 0.11162575266327003,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017531183548271656,
      "kl": 0.0017838965286500752,
      "learning_rate": 9.776841130152848e-07,
      "loss": 0.0001,
      "num_tokens": 66224825.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2410,
      "step_time": 22.347840026021004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 166.0,
      "completions/mean_terminated_length": 166.0,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.38478899747133255,
      "epoch": 0.11167207040296434,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011761828325688839,
      "kl": 0.006792910513468087,
      "learning_rate": 9.776748494673459e-07,
      "loss": 0.0003,
      "num_tokens": 66247289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2411,
      "step_time": 18.65980200096965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 131.6875,
      "completions/mean_terminated_length": 131.6875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.27228139340877533,
      "epoch": 0.11171838814265864,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008121551014482975,
      "kl": 0.0026825943496078253,
      "learning_rate": 9.77665585919407e-07,
      "loss": 0.0001,
      "num_tokens": 66266740.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2412,
      "step_time": 14.016145922243595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 125.375,
      "completions/mean_terminated_length": 125.375,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.26529572159051895,
      "epoch": 0.11176470588235295,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031022189650684595,
      "kl": 0.002264054666738957,
      "learning_rate": 9.776563223714681e-07,
      "loss": 0.0001,
      "num_tokens": 66289402.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2413,
      "step_time": 14.946750197559595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 117.125,
      "completions/mean_terminated_length": 117.125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.23947672545909882,
      "epoch": 0.11181102362204724,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002311090938746929,
      "kl": 0.0017960050900001079,
      "learning_rate": 9.776470588235295e-07,
      "loss": 0.0001,
      "num_tokens": 66309196.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2414,
      "step_time": 15.363229483366013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 175.3125,
      "completions/mean_terminated_length": 175.3125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.4139101505279541,
      "epoch": 0.11185734136174155,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007246461696922779,
      "kl": 0.004397790529765189,
      "learning_rate": 9.776377952755906e-07,
      "loss": 0.0002,
      "num_tokens": 66345873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2415,
      "step_time": 23.650139447301626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 165.0625,
      "completions/mean_terminated_length": 165.0625,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.4154718965291977,
      "epoch": 0.11190365910143585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018806760199368,
      "kl": 0.002184557670261711,
      "learning_rate": 9.776285317276517e-07,
      "loss": 0.0001,
      "num_tokens": 66398242.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2416,
      "step_time": 24.525605008006096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 149.4375,
      "completions/mean_terminated_length": 149.4375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.22699890658259392,
      "epoch": 0.11194997684113016,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001381315989419818,
      "kl": 0.001268629333935678,
      "learning_rate": 9.776192681797128e-07,
      "loss": 0.0001,
      "num_tokens": 66420873.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2417,
      "step_time": 17.389300521463156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 151.5,
      "completions/mean_terminated_length": 151.5,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.33007028698921204,
      "epoch": 0.11199629458082445,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019194837659597397,
      "kl": 0.0016540546203032136,
      "learning_rate": 9.77610004631774e-07,
      "loss": 0.0001,
      "num_tokens": 66441745.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2418,
      "step_time": 17.41394756361842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 194.0625,
      "completions/mean_terminated_length": 194.0625,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.21471988409757614,
      "epoch": 0.11204261232051876,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034169431310147047,
      "kl": 0.0028793515521101654,
      "learning_rate": 9.77600741083835e-07,
      "loss": 0.0001,
      "num_tokens": 66470034.0,
      "reward": 0.5765653252601624,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5765653252601624,
      "rewards/reward_func/std": 0.0,
      "step": 2419,
      "step_time": 20.30102963745594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 137.0,
      "completions/mean_terminated_length": 137.0,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.2592247426509857,
      "epoch": 0.11208893006021306,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009942702949047089,
      "kl": 0.0028831594972871244,
      "learning_rate": 9.775914775358962e-07,
      "loss": 0.0001,
      "num_tokens": 66490578.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2420,
      "step_time": 15.012401573359966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 299.0,
      "completions/max_terminated_length": 299.0,
      "completions/mean_length": 236.1875,
      "completions/mean_terminated_length": 236.1875,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "entropy": 0.2608606889843941,
      "epoch": 0.11213524779990737,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07392553240060806,
      "kl": 0.003077751083765179,
      "learning_rate": 9.775822139879573e-07,
      "loss": -0.0107,
      "num_tokens": 66522869.0,
      "reward": 0.5973880887031555,
      "reward_std": 0.41606178879737854,
      "rewards/reward_func/mean": 0.5973880887031555,
      "rewards/reward_func/std": 0.41606178879737854,
      "step": 2421,
      "step_time": 27.28056574985385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 188.625,
      "completions/mean_terminated_length": 188.625,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.22484597191214561,
      "epoch": 0.11218156553960167,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002945329761132598,
      "kl": 0.002411195106105879,
      "learning_rate": 9.775729504400185e-07,
      "loss": 0.0001,
      "num_tokens": 66544639.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2422,
      "step_time": 19.510927099734545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 368.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 235.5625,
      "completions/mean_terminated_length": 235.5625,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.41190581023693085,
      "epoch": 0.11222788327929598,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08581096678972244,
      "kl": 0.006259789690375328,
      "learning_rate": 9.775636868920796e-07,
      "loss": -0.1313,
      "num_tokens": 66578328.0,
      "reward": 0.15155190229415894,
      "reward_std": 0.2728007137775421,
      "rewards/reward_func/mean": 0.15155190229415894,
      "rewards/reward_func/std": 0.2728007137775421,
      "step": 2423,
      "step_time": 32.226893462240696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 147.625,
      "completions/mean_terminated_length": 147.625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.29873067885637283,
      "epoch": 0.11227420101899027,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002238903194665909,
      "kl": 0.0017924793064594269,
      "learning_rate": 9.775544233441407e-07,
      "loss": 0.0001,
      "num_tokens": 66599282.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2424,
      "step_time": 16.00865687429905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 179.4375,
      "completions/mean_terminated_length": 179.4375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.419899620115757,
      "epoch": 0.11232051875868458,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004521338734775782,
      "kl": 0.0033975655678659678,
      "learning_rate": 9.775451597962018e-07,
      "loss": 0.0002,
      "num_tokens": 66621449.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2425,
      "step_time": 18.19551219791174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 178.3125,
      "completions/mean_terminated_length": 178.3125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3378634974360466,
      "epoch": 0.11236683649837888,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003737666178494692,
      "kl": 0.0030273490119725466,
      "learning_rate": 9.77535896248263e-07,
      "loss": 0.0002,
      "num_tokens": 66647182.0,
      "reward": 0.5647181272506714,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5647181272506714,
      "rewards/reward_func/std": 0.0,
      "step": 2426,
      "step_time": 21.057276505976915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 131.125,
      "completions/mean_terminated_length": 131.125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2890597730875015,
      "epoch": 0.11241315423807319,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018485448090359569,
      "kl": 0.0013900745252612978,
      "learning_rate": 9.775266327003243e-07,
      "loss": 0.0001,
      "num_tokens": 66683168.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2427,
      "step_time": 18.160382740199566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 154.75,
      "completions/mean_terminated_length": 154.75,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.4234418570995331,
      "epoch": 0.11245947197776748,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002827100455760956,
      "kl": 0.0024213457363657653,
      "learning_rate": 9.775173691523854e-07,
      "loss": 0.0001,
      "num_tokens": 66745916.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2428,
      "step_time": 28.01572649553418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 125.9375,
      "completions/mean_terminated_length": 125.9375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3015022277832031,
      "epoch": 0.1125057897174618,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028600769583135843,
      "kl": 0.0019417548028286546,
      "learning_rate": 9.775081056044463e-07,
      "loss": 0.0001,
      "num_tokens": 66766667.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2429,
      "step_time": 13.572780143469572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 355.0,
      "completions/max_terminated_length": 355.0,
      "completions/mean_length": 198.0625,
      "completions/mean_terminated_length": 198.0625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.3871123269200325,
      "epoch": 0.11255210745715609,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09472401440143585,
      "kl": 0.007698494824580848,
      "learning_rate": 9.774988420565075e-07,
      "loss": -0.1976,
      "num_tokens": 66805388.0,
      "reward": 0.05747421085834503,
      "reward_std": 0.22989685833454132,
      "rewards/reward_func/mean": 0.05747421085834503,
      "rewards/reward_func/std": 0.22989685833454132,
      "step": 2430,
      "step_time": 32.72367901727557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 140.5,
      "completions/mean_terminated_length": 140.5,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.30869461596012115,
      "epoch": 0.1125984251968504,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023353789001703262,
      "kl": 0.0014139002305455506,
      "learning_rate": 9.774895785085688e-07,
      "loss": 0.0001,
      "num_tokens": 66825540.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2431,
      "step_time": 14.9496890604496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 158.25,
      "completions/mean_terminated_length": 158.25,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.3768110051751137,
      "epoch": 0.1126447429365447,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021692905575037003,
      "kl": 0.0018430263153277338,
      "learning_rate": 9.7748031496063e-07,
      "loss": 0.0001,
      "num_tokens": 66852248.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2432,
      "step_time": 18.83238858729601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 181.8125,
      "completions/mean_terminated_length": 181.8125,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.2170022577047348,
      "epoch": 0.112691060676239,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002259301021695137,
      "kl": 0.001681604073382914,
      "learning_rate": 9.77471051412691e-07,
      "loss": 0.0001,
      "num_tokens": 66874341.0,
      "reward": 0.5934875011444092,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5934875011444092,
      "rewards/reward_func/std": 0.0,
      "step": 2433,
      "step_time": 19.728549901396036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 175.0,
      "completions/mean_terminated_length": 175.0,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.42328501492738724,
      "epoch": 0.1127373784159333,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011259117163717747,
      "kl": 0.0062677806708961725,
      "learning_rate": 9.774617878647522e-07,
      "loss": 0.0003,
      "num_tokens": 66903013.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2434,
      "step_time": 20.135791525244713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 240.6875,
      "completions/mean_terminated_length": 240.6875,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "entropy": 0.2761561721563339,
      "epoch": 0.11278369615562761,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07958658784627914,
      "kl": 0.008481328841298819,
      "learning_rate": 9.774525243168133e-07,
      "loss": 0.0165,
      "num_tokens": 66932784.0,
      "reward": 0.8905339241027832,
      "reward_std": 0.04503028094768524,
      "rewards/reward_func/mean": 0.8905339241027832,
      "rewards/reward_func/std": 0.04503028839826584,
      "step": 2435,
      "step_time": 24.84669253230095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 198.25,
      "completions/mean_terminated_length": 198.25,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.40073180198669434,
      "epoch": 0.1128300138953219,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11505474150180817,
      "kl": 0.006373452895786613,
      "learning_rate": 9.774432607688744e-07,
      "loss": 0.0427,
      "num_tokens": 66969812.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 2436,
      "step_time": 24.155964501202106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 186.125,
      "completions/mean_terminated_length": 186.125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.26683806255459785,
      "epoch": 0.11287633163501622,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002324956003576517,
      "kl": 0.0019464888609945774,
      "learning_rate": 9.774339972209356e-07,
      "loss": 0.0001,
      "num_tokens": 67008854.0,
      "reward": 0.022873464971780777,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.022873464971780777,
      "rewards/reward_func/std": 0.0,
      "step": 2437,
      "step_time": 23.67304378002882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 299.0,
      "completions/max_terminated_length": 299.0,
      "completions/mean_length": 215.9375,
      "completions/mean_terminated_length": 215.9375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.40574753284454346,
      "epoch": 0.11292264937471051,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08100494742393494,
      "kl": 0.0038533667102456093,
      "learning_rate": 9.774247336729967e-07,
      "loss": -0.1,
      "num_tokens": 67035349.0,
      "reward": 0.029568437486886978,
      "reward_std": 0.09081331640481949,
      "rewards/reward_func/mean": 0.029568437486886978,
      "rewards/reward_func/std": 0.09081331640481949,
      "step": 2438,
      "step_time": 25.900131553411484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 132.875,
      "completions/mean_terminated_length": 132.875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.21693570539355278,
      "epoch": 0.11296896711440482,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15751352906227112,
      "kl": 0.0033534825779497623,
      "learning_rate": 9.774154701250578e-07,
      "loss": -0.0064,
      "num_tokens": 67058275.0,
      "reward": 0.9160261154174805,
      "reward_std": 0.027147367596626282,
      "rewards/reward_func/mean": 0.9160261154174805,
      "rewards/reward_func/std": 0.027147362008690834,
      "step": 2439,
      "step_time": 14.807053968310356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 287.0,
      "completions/max_terminated_length": 287.0,
      "completions/mean_length": 215.9375,
      "completions/mean_terminated_length": 215.9375,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.2589073069393635,
      "epoch": 0.11301528485409912,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1171020120382309,
      "kl": 0.0051482305862009525,
      "learning_rate": 9.77406206577119e-07,
      "loss": -0.1343,
      "num_tokens": 67096722.0,
      "reward": 0.23852992057800293,
      "reward_std": 0.39258697628974915,
      "rewards/reward_func/mean": 0.23852992057800293,
      "rewards/reward_func/std": 0.39258700609207153,
      "step": 2440,
      "step_time": 27.961682315915823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 127.6875,
      "completions/mean_terminated_length": 127.6875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.30557843297719955,
      "epoch": 0.11306160259379343,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004527962300926447,
      "kl": 0.0023293305130209774,
      "learning_rate": 9.7739694302918e-07,
      "loss": 0.0001,
      "num_tokens": 67116317.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2441,
      "step_time": 13.561669934540987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 134.6875,
      "completions/mean_terminated_length": 134.6875,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.3187221437692642,
      "epoch": 0.11310792033348772,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003435620805248618,
      "kl": 0.001834153721574694,
      "learning_rate": 9.773876794812412e-07,
      "loss": 0.0001,
      "num_tokens": 67145624.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2442,
      "step_time": 17.71932018175721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 156.9375,
      "completions/mean_terminated_length": 156.9375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.42840543389320374,
      "epoch": 0.11315423807318203,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034899311140179634,
      "kl": 0.002661260892637074,
      "learning_rate": 9.773784159333023e-07,
      "loss": 0.0001,
      "num_tokens": 67190375.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2443,
      "step_time": 23.76982979103923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 160.1875,
      "completions/mean_terminated_length": 160.1875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.2066311314702034,
      "epoch": 0.11320055581287633,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016302632866427302,
      "kl": 0.0012858854897785932,
      "learning_rate": 9.773691523853636e-07,
      "loss": 0.0001,
      "num_tokens": 67217018.0,
      "reward": 0.6227038502693176,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6227038502693176,
      "rewards/reward_func/std": 0.0,
      "step": 2444,
      "step_time": 17.93243756890297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 181.3125,
      "completions/mean_terminated_length": 181.3125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3939415439963341,
      "epoch": 0.11324687355257064,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01033829990774393,
      "kl": 0.004616707505192608,
      "learning_rate": 9.773598888374248e-07,
      "loss": 0.0002,
      "num_tokens": 67252991.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2445,
      "step_time": 22.318165626376867
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 220.1875,
      "completions/mean_terminated_length": 220.1875,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.4108148366212845,
      "epoch": 0.11329319129226494,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09378018230199814,
      "kl": 0.003654955537058413,
      "learning_rate": 9.773506252894859e-07,
      "loss": -0.0269,
      "num_tokens": 67285250.0,
      "reward": 0.11413758993148804,
      "reward_std": 0.31188327074050903,
      "rewards/reward_func/mean": 0.11413758993148804,
      "rewards/reward_func/std": 0.31188327074050903,
      "step": 2446,
      "step_time": 23.56105723977089
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 120.3125,
      "completions/mean_terminated_length": 120.3125,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.29247353225946426,
      "epoch": 0.11333950903195925,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023144776932895184,
      "kl": 0.0017470692691858858,
      "learning_rate": 9.77341361741547e-07,
      "loss": 0.0001,
      "num_tokens": 67309751.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2447,
      "step_time": 15.461145281791687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 214.5,
      "completions/mean_terminated_length": 214.5,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "entropy": 0.3118080571293831,
      "epoch": 0.11338582677165354,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09761524945497513,
      "kl": 0.0035818603937514126,
      "learning_rate": 9.773320981936081e-07,
      "loss": -0.0051,
      "num_tokens": 67333135.0,
      "reward": 0.0625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.0625,
      "rewards/reward_func/std": 0.25,
      "step": 2448,
      "step_time": 22.700984682887793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 158.3125,
      "completions/mean_terminated_length": 158.3125,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.17893319949507713,
      "epoch": 0.11343214451134785,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028546981047838926,
      "kl": 0.00166060306946747,
      "learning_rate": 9.773228346456693e-07,
      "loss": 0.0001,
      "num_tokens": 67362628.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2449,
      "step_time": 17.541016452014446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 161.75,
      "completions/mean_terminated_length": 161.75,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.39219270646572113,
      "epoch": 0.11347846225104215,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016609176527708769,
      "kl": 0.0018543552432674915,
      "learning_rate": 9.773135710977304e-07,
      "loss": 0.0001,
      "num_tokens": 67396688.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2450,
      "step_time": 20.098556522279978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 116.0,
      "completions/max_terminated_length": 116.0,
      "completions/mean_length": 100.8125,
      "completions/mean_terminated_length": 100.8125,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.253155704587698,
      "epoch": 0.11352477999073646,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006407290231436491,
      "kl": 0.001897846581414342,
      "learning_rate": 9.773043075497915e-07,
      "loss": 0.0001,
      "num_tokens": 67415869.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2451,
      "step_time": 11.394967649132013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 203.5625,
      "completions/mean_terminated_length": 203.5625,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.24599649012088776,
      "epoch": 0.11357109773043075,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018369624158367515,
      "kl": 0.0014612114755436778,
      "learning_rate": 9.772950440018526e-07,
      "loss": 0.0001,
      "num_tokens": 67450502.0,
      "reward": 0.11362193524837494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11362193524837494,
      "rewards/reward_func/std": 0.0,
      "step": 2452,
      "step_time": 25.18342625722289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 185.9375,
      "completions/mean_terminated_length": 185.9375,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.18431689217686653,
      "epoch": 0.11361741547012506,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08234444260597229,
      "kl": 0.001167101989267394,
      "learning_rate": 9.772857804539138e-07,
      "loss": 0.0121,
      "num_tokens": 67472933.0,
      "reward": 0.9698399305343628,
      "reward_std": 0.03531983122229576,
      "rewards/reward_func/mean": 0.9698399305343628,
      "rewards/reward_func/std": 0.03531982749700546,
      "step": 2453,
      "step_time": 20.30852472409606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 163.5625,
      "completions/mean_terminated_length": 163.5625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3077933043241501,
      "epoch": 0.11366373320981936,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002004193374887109,
      "kl": 0.0016860596660990268,
      "learning_rate": 9.772765169059749e-07,
      "loss": 0.0001,
      "num_tokens": 67494798.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2454,
      "step_time": 19.051043465733528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 127.0625,
      "completions/mean_terminated_length": 127.0625,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3176931366324425,
      "epoch": 0.11371005094951367,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023375828750431538,
      "kl": 0.0020944648422300816,
      "learning_rate": 9.77267253358036e-07,
      "loss": 0.0001,
      "num_tokens": 67523487.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2455,
      "step_time": 16.009984277188778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 194.5,
      "completions/mean_terminated_length": 194.5,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.37010613828897476,
      "epoch": 0.11375636868920797,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0046645901165902615,
      "kl": 0.0030380228417925537,
      "learning_rate": 9.772579898100971e-07,
      "loss": 0.0002,
      "num_tokens": 67545591.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2456,
      "step_time": 20.34482028707862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 149.875,
      "completions/mean_terminated_length": 149.875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.2914397120475769,
      "epoch": 0.11380268642890227,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021178536117076874,
      "kl": 0.0018803998245857656,
      "learning_rate": 9.772487262621585e-07,
      "loss": 0.0001,
      "num_tokens": 67581829.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2457,
      "step_time": 20.24321200698614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 129.1875,
      "completions/mean_terminated_length": 129.1875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.29282809793949127,
      "epoch": 0.11384900416859657,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017254592385143042,
      "kl": 0.0013090102875139564,
      "learning_rate": 9.772394627142196e-07,
      "loss": 0.0001,
      "num_tokens": 67604856.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2458,
      "step_time": 14.605748381465673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 120.25,
      "completions/mean_terminated_length": 120.25,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2606007978320122,
      "epoch": 0.11389532190829088,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004506888799369335,
      "kl": 0.0016115727194119245,
      "learning_rate": 9.772301991662807e-07,
      "loss": 0.0001,
      "num_tokens": 67627132.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2459,
      "step_time": 14.128421925008297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 135.6875,
      "completions/mean_terminated_length": 135.6875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2472737394273281,
      "epoch": 0.11394163964798518,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004102014470845461,
      "kl": 0.0022828959627076983,
      "learning_rate": 9.772209356183416e-07,
      "loss": 0.0001,
      "num_tokens": 67646807.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2460,
      "step_time": 14.82594895362854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 184.125,
      "completions/mean_terminated_length": 184.125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.41472016274929047,
      "epoch": 0.11398795738767949,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002807251876220107,
      "kl": 0.0025067999376915395,
      "learning_rate": 9.77211672070403e-07,
      "loss": 0.0001,
      "num_tokens": 67689193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2461,
      "step_time": 22.917190868407488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 156.6875,
      "completions/mean_terminated_length": 156.6875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.24410531669855118,
      "epoch": 0.11403427512737378,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002435762668028474,
      "kl": 0.0017704214551486075,
      "learning_rate": 9.77202408522464e-07,
      "loss": 0.0001,
      "num_tokens": 67709908.0,
      "reward": 0.8242367506027222,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8242367506027222,
      "rewards/reward_func/std": 0.0,
      "step": 2462,
      "step_time": 17.05655563622713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 188.875,
      "completions/mean_terminated_length": 188.875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.39517487585544586,
      "epoch": 0.11408059286706809,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10505260527133942,
      "kl": 0.002370894537307322,
      "learning_rate": 9.771931449745252e-07,
      "loss": 0.0833,
      "num_tokens": 67730962.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 2463,
      "step_time": 22.13871493563056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 159.5625,
      "completions/mean_terminated_length": 159.5625,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.3184591829776764,
      "epoch": 0.11412691060676239,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003013376612216234,
      "kl": 0.0021464183810167015,
      "learning_rate": 9.771838814265863e-07,
      "loss": 0.0001,
      "num_tokens": 67754427.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2464,
      "step_time": 16.785066470503807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 350.0,
      "completions/max_terminated_length": 350.0,
      "completions/mean_length": 236.5625,
      "completions/mean_terminated_length": 236.5625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.4302835613489151,
      "epoch": 0.1141732283464567,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09732300043106079,
      "kl": 0.0053596136276610196,
      "learning_rate": 9.771746178786475e-07,
      "loss": -0.0337,
      "num_tokens": 67778484.0,
      "reward": 0.11073075234889984,
      "reward_std": 0.3025740087032318,
      "rewards/reward_func/mean": 0.11073075234889984,
      "rewards/reward_func/std": 0.3025740087032318,
      "step": 2465,
      "step_time": 28.65038428083062
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 163.0625,
      "completions/mean_terminated_length": 163.0625,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.19304820522665977,
      "epoch": 0.114219546086151,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001923121279105544,
      "kl": 0.0011828722199425101,
      "learning_rate": 9.771653543307086e-07,
      "loss": 0.0001,
      "num_tokens": 67800149.0,
      "reward": 0.8464817404747009,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8464817404747009,
      "rewards/reward_func/std": 0.0,
      "step": 2466,
      "step_time": 17.765714410692453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 156.1875,
      "completions/mean_terminated_length": 156.1875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.27853118628263474,
      "epoch": 0.1142658638258453,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007093959022313356,
      "kl": 0.003363996569532901,
      "learning_rate": 9.771560907827697e-07,
      "loss": 0.0002,
      "num_tokens": 67823848.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2467,
      "step_time": 18.3814713396132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 166.625,
      "completions/mean_terminated_length": 166.625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.44477756321430206,
      "epoch": 0.1143121815655396,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014332630671560764,
      "kl": 0.0017497410881333053,
      "learning_rate": 9.771468272348308e-07,
      "loss": 0.0001,
      "num_tokens": 67871458.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2468,
      "step_time": 23.765099693089724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 175.6875,
      "completions/mean_terminated_length": 175.6875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.191434808075428,
      "epoch": 0.11435849930523391,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014370506396517158,
      "kl": 0.001366449665511027,
      "learning_rate": 9.77137563686892e-07,
      "loss": 0.0001,
      "num_tokens": 67909229.0,
      "reward": 0.9214109182357788,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9214109182357788,
      "rewards/reward_func/std": 0.0,
      "step": 2469,
      "step_time": 22.32357655465603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 287.0,
      "completions/max_terminated_length": 287.0,
      "completions/mean_length": 212.1875,
      "completions/mean_terminated_length": 212.1875,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.3637712076306343,
      "epoch": 0.1144048170449282,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07481560856103897,
      "kl": 0.005705999210476875,
      "learning_rate": 9.77128300138953e-07,
      "loss": -0.003,
      "num_tokens": 67933520.0,
      "reward": 0.17361770570278168,
      "reward_std": 0.3732668459415436,
      "rewards/reward_func/mean": 0.17361770570278168,
      "rewards/reward_func/std": 0.3732668459415436,
      "step": 2470,
      "step_time": 24.43019162490964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 201.625,
      "completions/mean_terminated_length": 201.625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.40962257981300354,
      "epoch": 0.11445113478462252,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007986698299646378,
      "kl": 0.0065677674720063806,
      "learning_rate": 9.771190365910144e-07,
      "loss": 0.0003,
      "num_tokens": 67959370.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2471,
      "step_time": 21.364749550819397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 204.9375,
      "completions/mean_terminated_length": 204.9375,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.33088134974241257,
      "epoch": 0.11449745252431681,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10680411756038666,
      "kl": 0.0036308920243754983,
      "learning_rate": 9.771097730430753e-07,
      "loss": 0.0064,
      "num_tokens": 67987625.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 2472,
      "step_time": 22.0128981359303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 155.0625,
      "completions/mean_terminated_length": 155.0625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.1786942407488823,
      "epoch": 0.11454377026401112,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005208641290664673,
      "kl": 0.0022907425882294774,
      "learning_rate": 9.771005094951365e-07,
      "loss": 0.0001,
      "num_tokens": 68009706.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 2473,
      "step_time": 15.87685788795352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 125.5625,
      "completions/mean_terminated_length": 125.5625,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.1408357098698616,
      "epoch": 0.11459008800370542,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003429030068218708,
      "kl": 0.0022986862750258297,
      "learning_rate": 9.770912459471978e-07,
      "loss": 0.0001,
      "num_tokens": 68029555.0,
      "reward": 0.51341712474823,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.51341712474823,
      "rewards/reward_func/std": 0.0,
      "step": 2474,
      "step_time": 12.78066784888506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 193.625,
      "completions/mean_terminated_length": 193.625,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.42769351601600647,
      "epoch": 0.11463640574339973,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001790591748431325,
      "kl": 0.0019036740413866937,
      "learning_rate": 9.77081982399259e-07,
      "loss": 0.0001,
      "num_tokens": 68057197.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2475,
      "step_time": 21.227740541100502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 122.375,
      "completions/mean_terminated_length": 122.375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.32360443472862244,
      "epoch": 0.11468272348309402,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008208557032048702,
      "kl": 0.003330700274091214,
      "learning_rate": 9.7707271885132e-07,
      "loss": 0.0002,
      "num_tokens": 68077907.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2476,
      "step_time": 14.623869501054287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 191.9375,
      "completions/mean_terminated_length": 191.9375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.4546266943216324,
      "epoch": 0.11472904122278833,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006665271706879139,
      "kl": 0.004630137351341546,
      "learning_rate": 9.770634553033812e-07,
      "loss": 0.0002,
      "num_tokens": 68107554.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2477,
      "step_time": 21.555527418851852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 162.125,
      "completions/mean_terminated_length": 162.125,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.27639149501919746,
      "epoch": 0.11477535896248263,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004952006973326206,
      "kl": 0.0035169259645044804,
      "learning_rate": 9.770541917554423e-07,
      "loss": 0.0002,
      "num_tokens": 68131668.0,
      "reward": 0.8751733303070068,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8751733303070068,
      "rewards/reward_func/std": 0.0,
      "step": 2478,
      "step_time": 18.94213031604886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 177.125,
      "completions/mean_terminated_length": 177.125,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3737163841724396,
      "epoch": 0.11482167670217694,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038529098965227604,
      "kl": 0.0026498447987250984,
      "learning_rate": 9.770449282075034e-07,
      "loss": 0.0001,
      "num_tokens": 68157302.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2479,
      "step_time": 19.80051399767399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 194.4375,
      "completions/mean_terminated_length": 194.4375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.30508412420749664,
      "epoch": 0.11486799444187124,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007849538698792458,
      "kl": 0.0031641992973163724,
      "learning_rate": 9.770356646595646e-07,
      "loss": 0.0002,
      "num_tokens": 68192989.0,
      "reward": 0.5081327557563782,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5081327557563782,
      "rewards/reward_func/std": 0.0,
      "step": 2480,
      "step_time": 22.772242203354836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 155.4375,
      "completions/mean_terminated_length": 155.4375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.3316722735762596,
      "epoch": 0.11491431218156554,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002754985122010112,
      "kl": 0.0022676190128549933,
      "learning_rate": 9.770264011116257e-07,
      "loss": 0.0001,
      "num_tokens": 68217060.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2481,
      "step_time": 16.53058822080493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 334.0,
      "completions/max_terminated_length": 334.0,
      "completions/mean_length": 246.0,
      "completions/mean_terminated_length": 246.0,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.3607417270541191,
      "epoch": 0.11496062992125984,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10553093254566193,
      "kl": 0.01073522213846445,
      "learning_rate": 9.770171375636868e-07,
      "loss": -0.1059,
      "num_tokens": 68251236.0,
      "reward": 0.3325149118900299,
      "reward_std": 0.4026811718940735,
      "rewards/reward_func/mean": 0.3325149118900299,
      "rewards/reward_func/std": 0.4026811718940735,
      "step": 2482,
      "step_time": 30.127475015819073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 147.625,
      "completions/mean_terminated_length": 147.625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3244210407137871,
      "epoch": 0.11500694766095415,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023280037567019463,
      "kl": 0.0018257657357025892,
      "learning_rate": 9.77007874015748e-07,
      "loss": 0.0001,
      "num_tokens": 68276110.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2483,
      "step_time": 18.436642192304134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 135.8125,
      "completions/mean_terminated_length": 135.8125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.20642798021435738,
      "epoch": 0.11505326540064845,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002512483624741435,
      "kl": 0.0014322706556413323,
      "learning_rate": 9.769986104678093e-07,
      "loss": 0.0001,
      "num_tokens": 68295835.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2484,
      "step_time": 14.553882360458374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 145.625,
      "completions/mean_terminated_length": 145.625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.32794320583343506,
      "epoch": 0.11509958314034276,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017755960579961538,
      "kl": 0.0019293739460408688,
      "learning_rate": 9.769893469198702e-07,
      "loss": 0.0001,
      "num_tokens": 68327093.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2485,
      "step_time": 18.01482929289341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 157.0625,
      "completions/mean_terminated_length": 157.0625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.4321519210934639,
      "epoch": 0.11514590088003705,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014660722808912396,
      "kl": 0.002090283203870058,
      "learning_rate": 9.769800833719313e-07,
      "loss": 0.0001,
      "num_tokens": 68370150.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2486,
      "step_time": 21.97736431285739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 158.125,
      "completions/mean_terminated_length": 158.125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.26701557636260986,
      "epoch": 0.11519221861973136,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029601918067783117,
      "kl": 0.0023622983135282993,
      "learning_rate": 9.769708198239926e-07,
      "loss": 0.0001,
      "num_tokens": 68393512.0,
      "reward": 0.7958667874336243,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7958667874336243,
      "rewards/reward_func/std": 0.0,
      "step": 2487,
      "step_time": 16.45381862297654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 173.4375,
      "completions/mean_terminated_length": 173.4375,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.29334940016269684,
      "epoch": 0.11523853635942566,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.121180459856987,
      "kl": 0.0024091697414405644,
      "learning_rate": 9.769615562760538e-07,
      "loss": -0.0464,
      "num_tokens": 68417407.0,
      "reward": 0.18374274671077728,
      "reward_std": 0.010824820958077908,
      "rewards/reward_func/mean": 0.18374274671077728,
      "rewards/reward_func/std": 0.010824819095432758,
      "step": 2488,
      "step_time": 18.614742059260607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 127.0,
      "completions/max_terminated_length": 127.0,
      "completions/mean_length": 112.25,
      "completions/mean_terminated_length": 112.25,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.26405394822359085,
      "epoch": 0.11528485409911997,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002262190915644169,
      "kl": 0.0015944482001941651,
      "learning_rate": 9.769522927281149e-07,
      "loss": 0.0001,
      "num_tokens": 68440739.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2489,
      "step_time": 13.781056050211191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 137.6875,
      "completions/mean_terminated_length": 137.6875,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3129817843437195,
      "epoch": 0.11533117183881426,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002422789577394724,
      "kl": 0.001710813317913562,
      "learning_rate": 9.76943029180176e-07,
      "loss": 0.0001,
      "num_tokens": 68462494.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2490,
      "step_time": 14.91152261197567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 160.875,
      "completions/mean_terminated_length": 160.875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.4279804676771164,
      "epoch": 0.11537748957850857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019432164262980223,
      "kl": 0.0018543907208368182,
      "learning_rate": 9.769337656322371e-07,
      "loss": 0.0001,
      "num_tokens": 68495004.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2491,
      "step_time": 21.28641689941287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 183.0,
      "completions/mean_terminated_length": 183.0,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.35045985877513885,
      "epoch": 0.11542380731820287,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009118014015257359,
      "kl": 0.005484711611643434,
      "learning_rate": 9.769245020842983e-07,
      "loss": 0.0003,
      "num_tokens": 68532764.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2492,
      "step_time": 22.926713228225708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 199.5625,
      "completions/mean_terminated_length": 199.5625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.32855086028575897,
      "epoch": 0.11547012505789718,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10711812973022461,
      "kl": 0.008030668599531054,
      "learning_rate": 9.769152385363594e-07,
      "loss": -0.0518,
      "num_tokens": 68570245.0,
      "reward": 0.625,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5,
      "step": 2493,
      "step_time": 24.291283402591944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 179.5625,
      "completions/mean_terminated_length": 179.5625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.384690523147583,
      "epoch": 0.11551644279759148,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0048539177514612675,
      "kl": 0.0027220979100093246,
      "learning_rate": 9.769059749884205e-07,
      "loss": 0.0001,
      "num_tokens": 68601886.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2494,
      "step_time": 21.380510710179806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 135.25,
      "completions/mean_terminated_length": 135.25,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.2969150096178055,
      "epoch": 0.11556276053728579,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0058605242520570755,
      "kl": 0.002608942857477814,
      "learning_rate": 9.768967114404816e-07,
      "loss": 0.0001,
      "num_tokens": 68624962.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2495,
      "step_time": 14.64000740274787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 213.0,
      "completions/mean_terminated_length": 213.0,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "entropy": 0.28155994415283203,
      "epoch": 0.11560907827698008,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12155213952064514,
      "kl": 0.0028275814838707447,
      "learning_rate": 9.768874478925428e-07,
      "loss": 0.0209,
      "num_tokens": 68653154.0,
      "reward": 0.9307626485824585,
      "reward_std": 0.05584240332245827,
      "rewards/reward_func/mean": 0.9307626485824585,
      "rewards/reward_func/std": 0.05584241822361946,
      "step": 2496,
      "step_time": 21.322447326034307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 178.6875,
      "completions/mean_terminated_length": 178.6875,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.3451504111289978,
      "epoch": 0.11565539601667439,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00205622217617929,
      "kl": 0.0020811408176086843,
      "learning_rate": 9.768781843446039e-07,
      "loss": 0.0001,
      "num_tokens": 68683149.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2497,
      "step_time": 19.86602247133851
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 195.6875,
      "completions/mean_terminated_length": 195.6875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.22228306159377098,
      "epoch": 0.11570171375636869,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10164478421211243,
      "kl": 0.0013924910745117813,
      "learning_rate": 9.76868920796665e-07,
      "loss": 0.0231,
      "num_tokens": 68705768.0,
      "reward": 0.8223152160644531,
      "reward_std": 0.05801927670836449,
      "rewards/reward_func/mean": 0.8223152160644531,
      "rewards/reward_func/std": 0.058019280433654785,
      "step": 2498,
      "step_time": 19.609724581241608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 158.3125,
      "completions/mean_terminated_length": 158.3125,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.10717789456248283,
      "epoch": 0.115748031496063,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024656946770846844,
      "kl": 0.0008034743805183098,
      "learning_rate": 9.768596572487261e-07,
      "loss": 0.0,
      "num_tokens": 68744029.0,
      "reward": 0.7598356604576111,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7598356604576111,
      "rewards/reward_func/std": 0.0,
      "step": 2499,
      "step_time": 19.850645527243614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 152.9375,
      "completions/mean_terminated_length": 152.9375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.24117330461740494,
      "epoch": 0.1157943492357573,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00201165908947587,
      "kl": 0.001557450246764347,
      "learning_rate": 9.768503937007873e-07,
      "loss": 0.0001,
      "num_tokens": 68765308.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2500,
      "step_time": 16.42570138722658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 166.125,
      "completions/mean_terminated_length": 166.125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.380124993622303,
      "epoch": 0.1158406669754516,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0053825657814741135,
      "kl": 0.003818499739281833,
      "learning_rate": 9.768411301528486e-07,
      "loss": 0.0002,
      "num_tokens": 68789294.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2501,
      "step_time": 20.48052605614066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 304.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 251.5625,
      "completions/mean_terminated_length": 251.5625,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.20027782395482063,
      "epoch": 0.1158869847151459,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12447930872440338,
      "kl": 0.007670757360756397,
      "learning_rate": 9.768318666049097e-07,
      "loss": -0.1226,
      "num_tokens": 68815015.0,
      "reward": 0.904274046421051,
      "reward_std": 0.2615731358528137,
      "rewards/reward_func/mean": 0.904274046421051,
      "rewards/reward_func/std": 0.2615731358528137,
      "step": 2502,
      "step_time": 25.985407132655382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 128.1875,
      "completions/mean_terminated_length": 128.1875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3250097781419754,
      "epoch": 0.11593330245484021,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003276855917647481,
      "kl": 0.002020838321186602,
      "learning_rate": 9.768226030569706e-07,
      "loss": 0.0001,
      "num_tokens": 68835066.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2503,
      "step_time": 14.984255533665419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 112.6875,
      "completions/mean_terminated_length": 112.6875,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "entropy": 0.28627268224954605,
      "epoch": 0.1159796201945345,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021926662884652615,
      "kl": 0.0016013141721487045,
      "learning_rate": 9.76813339509032e-07,
      "loss": 0.0001,
      "num_tokens": 68856005.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2504,
      "step_time": 13.714712552726269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 135.125,
      "completions/mean_terminated_length": 135.125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.26569344848394394,
      "epoch": 0.11602593793422881,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037235531490296125,
      "kl": 0.0022294174996204674,
      "learning_rate": 9.76804075961093e-07,
      "loss": 0.0001,
      "num_tokens": 68878823.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2505,
      "step_time": 15.604820631444454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 185.625,
      "completions/mean_terminated_length": 185.625,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.24384716153144836,
      "epoch": 0.11607225567392311,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025913529098033905,
      "kl": 0.002238882676465437,
      "learning_rate": 9.767948124131542e-07,
      "loss": 0.0001,
      "num_tokens": 68903745.0,
      "reward": 0.5384570360183716,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5384570360183716,
      "rewards/reward_func/std": 0.0,
      "step": 2506,
      "step_time": 19.494688913226128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 123.9375,
      "completions/mean_terminated_length": 123.9375,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2504768893122673,
      "epoch": 0.11611857341361742,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003581949509680271,
      "kl": 0.0018356178188696504,
      "learning_rate": 9.767855488652154e-07,
      "loss": 0.0001,
      "num_tokens": 68924256.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2507,
      "step_time": 13.948278229683638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 194.6875,
      "completions/mean_terminated_length": 194.6875,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.4146835431456566,
      "epoch": 0.11616489115331172,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004226749762892723,
      "kl": 0.0034904314088635147,
      "learning_rate": 9.767762853172765e-07,
      "loss": 0.0002,
      "num_tokens": 68953003.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2508,
      "step_time": 22.90442780032754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 171.5,
      "completions/mean_terminated_length": 171.5,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.17129074409604073,
      "epoch": 0.11621120889300603,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003777128178626299,
      "kl": 0.002778336522169411,
      "learning_rate": 9.767670217693376e-07,
      "loss": 0.0001,
      "num_tokens": 68975363.0,
      "reward": 0.7091062068939209,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7091062068939209,
      "rewards/reward_func/std": 0.0,
      "step": 2509,
      "step_time": 17.512671183794737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 151.5,
      "completions/mean_terminated_length": 151.5,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.1635386347770691,
      "epoch": 0.11625752663270032,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027541702147573233,
      "kl": 0.001805927458917722,
      "learning_rate": 9.767577582213987e-07,
      "loss": 0.0001,
      "num_tokens": 68996411.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2510,
      "step_time": 15.2217398583889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 141.3125,
      "completions/mean_terminated_length": 141.3125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.22284924611449242,
      "epoch": 0.11630384437239463,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014736582525074482,
      "kl": 0.0010546468110987917,
      "learning_rate": 9.767484946734599e-07,
      "loss": 0.0001,
      "num_tokens": 69016128.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2511,
      "step_time": 15.050313018262386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 132.5,
      "completions/mean_terminated_length": 132.5,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3137256056070328,
      "epoch": 0.11635016211208893,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004238894674926996,
      "kl": 0.0024793065967969596,
      "learning_rate": 9.76739231125521e-07,
      "loss": 0.0001,
      "num_tokens": 69040616.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2512,
      "step_time": 15.094173938035965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 121.3125,
      "completions/mean_terminated_length": 121.3125,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2705349326133728,
      "epoch": 0.11639647985178324,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013780392473563552,
      "kl": 0.0012340772082097828,
      "learning_rate": 9.76729967577582e-07,
      "loss": 0.0001,
      "num_tokens": 69063181.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2513,
      "step_time": 14.087721854448318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 190.8125,
      "completions/mean_terminated_length": 190.8125,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.15664460882544518,
      "epoch": 0.11644279759147753,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003150130156427622,
      "kl": 0.0015929357905406505,
      "learning_rate": 9.767207040296434e-07,
      "loss": 0.0001,
      "num_tokens": 69100618.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2514,
      "step_time": 21.97874530404806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 139.75,
      "completions/mean_terminated_length": 139.75,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3434393182396889,
      "epoch": 0.11648911533117184,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027103142347186804,
      "kl": 0.0019133105815853924,
      "learning_rate": 9.767114404817044e-07,
      "loss": 0.0001,
      "num_tokens": 69125222.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2515,
      "step_time": 16.80498769879341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 156.75,
      "completions/mean_terminated_length": 156.75,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3457399159669876,
      "epoch": 0.11653543307086614,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031414953991770744,
      "kl": 0.0023865659604780376,
      "learning_rate": 9.767021769337655e-07,
      "loss": 0.0001,
      "num_tokens": 69150946.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2516,
      "step_time": 17.40716779232025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 150.0,
      "completions/mean_terminated_length": 150.0,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.22851867228746414,
      "epoch": 0.11658175081056045,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013847298687323928,
      "kl": 0.001284622703678906,
      "learning_rate": 9.766929133858268e-07,
      "loss": 0.0001,
      "num_tokens": 69171474.0,
      "reward": 0.3384654223918915,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3384654223918915,
      "rewards/reward_func/std": 0.0,
      "step": 2517,
      "step_time": 16.074111629277468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 153.25,
      "completions/mean_terminated_length": 153.25,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3559677377343178,
      "epoch": 0.11662806855025475,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003983703907579184,
      "kl": 0.00274868356063962,
      "learning_rate": 9.76683649837888e-07,
      "loss": 0.0001,
      "num_tokens": 69195398.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2518,
      "step_time": 16.23988012596965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 201.25,
      "completions/mean_terminated_length": 201.25,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.4453985244035721,
      "epoch": 0.11667438628994906,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12077568471431732,
      "kl": 0.00353471894050017,
      "learning_rate": 9.76674386289949e-07,
      "loss": 0.0761,
      "num_tokens": 69216186.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 2519,
      "step_time": 24.598925530910492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 144.3125,
      "completions/mean_terminated_length": 144.3125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.26991620659828186,
      "epoch": 0.11672070402964335,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01442884560674429,
      "kl": 0.0031042208429425955,
      "learning_rate": 9.766651227420102e-07,
      "loss": 0.0002,
      "num_tokens": 69236959.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2520,
      "step_time": 16.014219731092453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 132.0625,
      "completions/mean_terminated_length": 132.0625,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2830207869410515,
      "epoch": 0.11676702176933766,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031016753055155277,
      "kl": 0.002233900304418057,
      "learning_rate": 9.766558591940713e-07,
      "loss": 0.0001,
      "num_tokens": 69260544.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2521,
      "step_time": 16.300537552684546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 178.8125,
      "completions/mean_terminated_length": 178.8125,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.27052196860313416,
      "epoch": 0.11681333950903196,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13894326984882355,
      "kl": 0.004763779055792838,
      "learning_rate": 9.766465956461324e-07,
      "loss": 0.0117,
      "num_tokens": 69291805.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 2522,
      "step_time": 20.154803916811943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 159.9375,
      "completions/mean_terminated_length": 159.9375,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.40736597776412964,
      "epoch": 0.11685965724872627,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026998703833669424,
      "kl": 0.0018361476249992847,
      "learning_rate": 9.766373320981936e-07,
      "loss": 0.0001,
      "num_tokens": 69330492.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2523,
      "step_time": 21.67042900249362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 176.3125,
      "completions/mean_terminated_length": 176.3125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3571884483098984,
      "epoch": 0.11690597498842056,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11671131104230881,
      "kl": 0.010688660433515906,
      "learning_rate": 9.766280685502547e-07,
      "loss": 0.0162,
      "num_tokens": 69351553.0,
      "reward": 0.7045598030090332,
      "reward_std": 0.4201183021068573,
      "rewards/reward_func/mean": 0.7045598030090332,
      "rewards/reward_func/std": 0.4201183021068573,
      "step": 2524,
      "step_time": 19.222447354346514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 132.8125,
      "completions/mean_terminated_length": 132.8125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.34327084571123123,
      "epoch": 0.11695229272811487,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003010922111570835,
      "kl": 0.0019749950733967125,
      "learning_rate": 9.766188050023158e-07,
      "loss": 0.0001,
      "num_tokens": 69377950.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2525,
      "step_time": 16.51058854907751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 207.0,
      "completions/mean_terminated_length": 207.0,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.3339350149035454,
      "epoch": 0.11699861046780917,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12125305831432343,
      "kl": 0.001954075414687395,
      "learning_rate": 9.76609541454377e-07,
      "loss": 0.0955,
      "num_tokens": 69399086.0,
      "reward": 0.6449456214904785,
      "reward_std": 0.17198549211025238,
      "rewards/reward_func/mean": 0.6449456214904785,
      "rewards/reward_func/std": 0.17198549211025238,
      "step": 2526,
      "step_time": 23.677948355674744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 161.9375,
      "completions/mean_terminated_length": 161.9375,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.26828376948833466,
      "epoch": 0.11704492820750348,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028099219780415297,
      "kl": 0.0018537837022449821,
      "learning_rate": 9.766002779064383e-07,
      "loss": 0.0001,
      "num_tokens": 69419805.0,
      "reward": 0.7632111310958862,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7632111310958862,
      "rewards/reward_func/std": 0.0,
      "step": 2527,
      "step_time": 16.765187423676252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 168.0625,
      "completions/mean_terminated_length": 168.0625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.4304979592561722,
      "epoch": 0.11709124594719778,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002772212726995349,
      "kl": 0.0025947302929125726,
      "learning_rate": 9.765910143584992e-07,
      "loss": 0.0001,
      "num_tokens": 69452286.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2528,
      "step_time": 20.656904868781567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 165.3125,
      "completions/mean_terminated_length": 165.3125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3324563130736351,
      "epoch": 0.11713756368689208,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13781264424324036,
      "kl": 0.005630438216030598,
      "learning_rate": 9.765817508105603e-07,
      "loss": 0.0223,
      "num_tokens": 69473795.0,
      "reward": 0.625,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5,
      "step": 2529,
      "step_time": 20.35880806297064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 345.0,
      "completions/max_terminated_length": 345.0,
      "completions/mean_length": 258.0,
      "completions/mean_terminated_length": 258.0,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.1971878558397293,
      "epoch": 0.11718388142658638,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0050765895284712315,
      "kl": 0.003672609105706215,
      "learning_rate": 9.765724872626214e-07,
      "loss": 0.0002,
      "num_tokens": 69498147.0,
      "reward": 0.8343905210494995,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8343905210494995,
      "rewards/reward_func/std": 0.0,
      "step": 2530,
      "step_time": 29.195719808340073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 306.0,
      "completions/max_terminated_length": 306.0,
      "completions/mean_length": 264.5625,
      "completions/mean_terminated_length": 264.5625,
      "completions/min_length": 229.0,
      "completions/min_terminated_length": 229.0,
      "entropy": 0.19106394052505493,
      "epoch": 0.11723019916628069,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002047973684966564,
      "kl": 0.0019100763311143965,
      "learning_rate": 9.765632237146828e-07,
      "loss": 0.0001,
      "num_tokens": 69524460.0,
      "reward": 0.795195996761322,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.795195996761322,
      "rewards/reward_func/std": 0.0,
      "step": 2531,
      "step_time": 25.892326433211565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 154.625,
      "completions/mean_terminated_length": 154.625,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.23522640764713287,
      "epoch": 0.11727651690597499,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13442489504814148,
      "kl": 0.004400189907755703,
      "learning_rate": 9.76553960166744e-07,
      "loss": -0.0669,
      "num_tokens": 69546870.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 2532,
      "step_time": 18.030234690755606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 132.75,
      "completions/mean_terminated_length": 132.75,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.30949801206588745,
      "epoch": 0.1173228346456693,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004218805581331253,
      "kl": 0.002690243854885921,
      "learning_rate": 9.76544696618805e-07,
      "loss": 0.0001,
      "num_tokens": 69582722.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2533,
      "step_time": 18.262815680354834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 169.625,
      "completions/mean_terminated_length": 169.625,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.4334608465433121,
      "epoch": 0.11736915238536359,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019446579972282052,
      "kl": 0.0019211559265386313,
      "learning_rate": 9.765354330708661e-07,
      "loss": 0.0001,
      "num_tokens": 69603804.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2534,
      "step_time": 17.4964744374156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 163.8125,
      "completions/mean_terminated_length": 163.8125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3907167837023735,
      "epoch": 0.1174154701250579,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008374964818358421,
      "kl": 0.0043603170197457075,
      "learning_rate": 9.765261695229273e-07,
      "loss": 0.0002,
      "num_tokens": 69633849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2535,
      "step_time": 18.502758789807558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 126.0,
      "completions/max_terminated_length": 126.0,
      "completions/mean_length": 102.5,
      "completions/mean_terminated_length": 102.5,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "entropy": 0.2020980902016163,
      "epoch": 0.1174617878647522,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015470476355403662,
      "kl": 0.001157489256002009,
      "learning_rate": 9.765169059749884e-07,
      "loss": 0.0001,
      "num_tokens": 69653137.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2536,
      "step_time": 12.262867517769337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 115.875,
      "completions/mean_terminated_length": 115.875,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.2808125540614128,
      "epoch": 0.11750810560444651,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0043172030709683895,
      "kl": 0.0021788671147078276,
      "learning_rate": 9.765076424270495e-07,
      "loss": 0.0001,
      "num_tokens": 69672479.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2537,
      "step_time": 13.167741309851408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 143.3125,
      "completions/mean_terminated_length": 143.3125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.2784837633371353,
      "epoch": 0.1175544233441408,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005533813498914242,
      "kl": 0.003838031552731991,
      "learning_rate": 9.764983788791106e-07,
      "loss": 0.0002,
      "num_tokens": 69693860.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2538,
      "step_time": 14.989644382148981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 128.8125,
      "completions/mean_terminated_length": 128.8125,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.342710942029953,
      "epoch": 0.11760074108383511,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002589267445728183,
      "kl": 0.0018046482873614877,
      "learning_rate": 9.764891153311718e-07,
      "loss": 0.0001,
      "num_tokens": 69717617.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2539,
      "step_time": 15.151292867958546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 357.0,
      "completions/max_terminated_length": 357.0,
      "completions/mean_length": 314.25,
      "completions/mean_terminated_length": 314.25,
      "completions/min_length": 246.0,
      "completions/min_terminated_length": 246.0,
      "entropy": 0.2687339186668396,
      "epoch": 0.11764705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07287666201591492,
      "kl": 0.002109242486767471,
      "learning_rate": 9.76479851783233e-07,
      "loss": -0.0628,
      "num_tokens": 69749941.0,
      "reward": 0.1672072559595108,
      "reward_std": 0.08809083700180054,
      "rewards/reward_func/mean": 0.1672072559595108,
      "rewards/reward_func/std": 0.08809084445238113,
      "step": 2540,
      "step_time": 31.04066489636898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 132.9375,
      "completions/mean_terminated_length": 132.9375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.27311180531978607,
      "epoch": 0.11769337656322372,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028962416108697653,
      "kl": 0.0020032948814332485,
      "learning_rate": 9.76470588235294e-07,
      "loss": 0.0001,
      "num_tokens": 69773412.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2541,
      "step_time": 14.571888111531734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 166.4375,
      "completions/mean_terminated_length": 166.4375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.3374975621700287,
      "epoch": 0.11773969430291802,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031090453267097473,
      "kl": 0.001794400392100215,
      "learning_rate": 9.764613246873551e-07,
      "loss": 0.0001,
      "num_tokens": 69793563.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2542,
      "step_time": 17.30877362936735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 152.125,
      "completions/mean_terminated_length": 152.125,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3300483003258705,
      "epoch": 0.11778601204261233,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032420754432678223,
      "kl": 0.0022386827040463686,
      "learning_rate": 9.764520611394163e-07,
      "loss": 0.0001,
      "num_tokens": 69814685.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2543,
      "step_time": 15.987787026911974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 206.0625,
      "completions/mean_terminated_length": 206.0625,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.1973423846065998,
      "epoch": 0.11783232978230662,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005508746951818466,
      "kl": 0.00416121503803879,
      "learning_rate": 9.764427975914776e-07,
      "loss": 0.0002,
      "num_tokens": 69848958.0,
      "reward": 0.6807124018669128,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6807124018669128,
      "rewards/reward_func/std": 0.0,
      "step": 2544,
      "step_time": 23.922963842749596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 145.5,
      "completions/mean_terminated_length": 145.5,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.29599516093730927,
      "epoch": 0.11787864752200093,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034114622976630926,
      "kl": 0.0027078649727627635,
      "learning_rate": 9.764335340435387e-07,
      "loss": 0.0001,
      "num_tokens": 69877846.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2545,
      "step_time": 17.332238290458918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 206.0,
      "completions/mean_terminated_length": 206.0,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.1840292140841484,
      "epoch": 0.11792496526169523,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019980361685156822,
      "kl": 0.0013126096746418625,
      "learning_rate": 9.764242704955996e-07,
      "loss": 0.0001,
      "num_tokens": 69928726.0,
      "reward": 0.9091564416885376,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9091564416885376,
      "rewards/reward_func/std": 0.0,
      "step": 2546,
      "step_time": 27.377622980624437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 177.3125,
      "completions/mean_terminated_length": 177.3125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.37352460622787476,
      "epoch": 0.11797128300138954,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01190211158245802,
      "kl": 0.006103089544922113,
      "learning_rate": 9.76415006947661e-07,
      "loss": 0.0003,
      "num_tokens": 69952907.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2547,
      "step_time": 19.16219438239932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 173.0625,
      "completions/mean_terminated_length": 173.0625,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.29012996703386307,
      "epoch": 0.11801760074108383,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029143390711396933,
      "kl": 0.0019404477789066732,
      "learning_rate": 9.764057433997221e-07,
      "loss": 0.0001,
      "num_tokens": 69982636.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2548,
      "step_time": 19.47384374216199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 173.875,
      "completions/mean_terminated_length": 173.875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3809279501438141,
      "epoch": 0.11806391848077814,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0041020638309419155,
      "kl": 0.0033375016937498003,
      "learning_rate": 9.763964798517832e-07,
      "loss": 0.0002,
      "num_tokens": 70019802.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2549,
      "step_time": 22.1692301556468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 318.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 237.75,
      "completions/mean_terminated_length": 237.75,
      "completions/min_length": 203.0,
      "completions/min_terminated_length": 203.0,
      "entropy": 0.3624110445380211,
      "epoch": 0.11811023622047244,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11295898258686066,
      "kl": 0.005656790337525308,
      "learning_rate": 9.763872163038444e-07,
      "loss": -0.1305,
      "num_tokens": 70046406.0,
      "reward": 0.3129962086677551,
      "reward_std": 0.46819740533828735,
      "rewards/reward_func/mean": 0.3129962086677551,
      "rewards/reward_func/std": 0.46819743514060974,
      "step": 2550,
      "step_time": 28.486052256077528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 292.0,
      "completions/max_terminated_length": 292.0,
      "completions/mean_length": 208.25,
      "completions/mean_terminated_length": 208.25,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.37478768825531006,
      "epoch": 0.11815655396016675,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1396738439798355,
      "kl": 0.004766521626152098,
      "learning_rate": 9.763779527559055e-07,
      "loss": 0.0838,
      "num_tokens": 70071898.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 2551,
      "step_time": 25.988287832587957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 351.0,
      "completions/max_terminated_length": 351.0,
      "completions/mean_length": 252.3125,
      "completions/mean_terminated_length": 252.3125,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "entropy": 0.3208429887890816,
      "epoch": 0.11820287169986105,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08668361604213715,
      "kl": 0.004409196495544165,
      "learning_rate": 9.763686892079666e-07,
      "loss": -0.1385,
      "num_tokens": 70111055.0,
      "reward": 0.34896644949913025,
      "reward_std": 0.3645111322402954,
      "rewards/reward_func/mean": 0.34896644949913025,
      "rewards/reward_func/std": 0.3645111620426178,
      "step": 2552,
      "step_time": 35.64171186834574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 333.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 242.125,
      "completions/mean_terminated_length": 242.125,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.4924083277583122,
      "epoch": 0.11824918943955535,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09804312139749527,
      "kl": 0.0031258187373168766,
      "learning_rate": 9.763594256600277e-07,
      "loss": 0.0219,
      "num_tokens": 70136737.0,
      "reward": 0.5625,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.5625,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 2553,
      "step_time": 29.227614890784025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 219.75,
      "completions/mean_terminated_length": 219.75,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "entropy": 0.29411546885967255,
      "epoch": 0.11829550717924965,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001870762906037271,
      "kl": 0.0017390170833095908,
      "learning_rate": 9.763501621120889e-07,
      "loss": 0.0001,
      "num_tokens": 70158845.0,
      "reward": 0.5308194756507874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5308194756507874,
      "rewards/reward_func/std": 0.0,
      "step": 2554,
      "step_time": 23.829374082386494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 131.0,
      "completions/mean_length": 109.3125,
      "completions/mean_terminated_length": 109.3125,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.3609594851732254,
      "epoch": 0.11834182491894396,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00425384845584631,
      "kl": 0.002779354923404753,
      "learning_rate": 9.7634089856415e-07,
      "loss": 0.0001,
      "num_tokens": 70184034.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2555,
      "step_time": 14.10405632853508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 159.9375,
      "completions/mean_terminated_length": 159.9375,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.3891973942518234,
      "epoch": 0.11838814265863826,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011609444627538323,
      "kl": 0.0014020515664014965,
      "learning_rate": 9.763316350162111e-07,
      "loss": 0.0001,
      "num_tokens": 70240625.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2556,
      "step_time": 25.749696049839258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 143.0625,
      "completions/mean_terminated_length": 143.0625,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.35119353979825974,
      "epoch": 0.11843446039833257,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004023308400064707,
      "kl": 0.0023010563745629042,
      "learning_rate": 9.763223714682724e-07,
      "loss": 0.0001,
      "num_tokens": 70276882.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2557,
      "step_time": 19.22999330982566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 157.5625,
      "completions/mean_terminated_length": 157.5625,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.3454890549182892,
      "epoch": 0.11848077813802686,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004776523914188147,
      "kl": 0.003007733845151961,
      "learning_rate": 9.763131079203334e-07,
      "loss": 0.0001,
      "num_tokens": 70299003.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2558,
      "step_time": 17.241932556033134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 170.0625,
      "completions/mean_terminated_length": 170.0625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.1966676041483879,
      "epoch": 0.11852709587772117,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001377631677314639,
      "kl": 0.0009226291585946456,
      "learning_rate": 9.763038443723945e-07,
      "loss": 0.0,
      "num_tokens": 70332252.0,
      "reward": 0.8751733303070068,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8751733303070068,
      "rewards/reward_func/std": 0.0,
      "step": 2559,
      "step_time": 20.28189316019416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 378.0,
      "completions/max_terminated_length": 378.0,
      "completions/mean_length": 352.0625,
      "completions/mean_terminated_length": 352.0625,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "entropy": 0.1965409442782402,
      "epoch": 0.11857341361741547,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001986697083339095,
      "kl": 0.0017113542126026005,
      "learning_rate": 9.762945808244556e-07,
      "loss": 0.0001,
      "num_tokens": 70374653.0,
      "reward": 0.7498524785041809,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7498524785041809,
      "rewards/reward_func/std": 0.0,
      "step": 2560,
      "step_time": 37.20276174321771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 304.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 218.4375,
      "completions/mean_terminated_length": 218.4375,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "entropy": 0.41453102231025696,
      "epoch": 0.11861973135710978,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035396451130509377,
      "kl": 0.002845336392056197,
      "learning_rate": 9.76285317276517e-07,
      "loss": 0.0001,
      "num_tokens": 70399076.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2561,
      "step_time": 26.513055469840765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 174.125,
      "completions/mean_terminated_length": 174.125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.36704379320144653,
      "epoch": 0.11866604909680407,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027061717119067907,
      "kl": 0.002521127520594746,
      "learning_rate": 9.76276053728578e-07,
      "loss": 0.0001,
      "num_tokens": 70423558.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2562,
      "step_time": 19.267679549753666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 130.8125,
      "completions/mean_terminated_length": 130.8125,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.32686904817819595,
      "epoch": 0.11871236683649838,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027874025981873274,
      "kl": 0.002084024017676711,
      "learning_rate": 9.762667901806392e-07,
      "loss": 0.0001,
      "num_tokens": 70444867.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2563,
      "step_time": 14.60996313393116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 165.125,
      "completions/mean_terminated_length": 165.125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.26866399496793747,
      "epoch": 0.11875868457619268,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003486102446913719,
      "kl": 0.002615977020468563,
      "learning_rate": 9.762575266327003e-07,
      "loss": 0.0001,
      "num_tokens": 70468373.0,
      "reward": 0.780767560005188,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.780767560005188,
      "rewards/reward_func/std": 0.0,
      "step": 2564,
      "step_time": 17.069831989705563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 132.875,
      "completions/mean_terminated_length": 132.875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.17322808876633644,
      "epoch": 0.11880500231588699,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13548368215560913,
      "kl": 0.003650828613899648,
      "learning_rate": 9.762482630847614e-07,
      "loss": -0.0752,
      "num_tokens": 70491299.0,
      "reward": 0.3941449522972107,
      "reward_std": 0.10506203025579453,
      "rewards/reward_func/mean": 0.3941449522972107,
      "rewards/reward_func/std": 0.10506203770637512,
      "step": 2565,
      "step_time": 16.4831387065351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 124.0,
      "completions/max_terminated_length": 124.0,
      "completions/mean_length": 105.0,
      "completions/mean_terminated_length": 105.0,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.2161022536456585,
      "epoch": 0.11885132005558129,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002563272602856159,
      "kl": 0.001570798660395667,
      "learning_rate": 9.762389995368226e-07,
      "loss": 0.0001,
      "num_tokens": 70511267.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2566,
      "step_time": 12.12009947001934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 186.25,
      "completions/mean_terminated_length": 186.25,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3114311099052429,
      "epoch": 0.1188976377952756,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12095338851213455,
      "kl": 0.00406394392484799,
      "learning_rate": 9.762297359888837e-07,
      "loss": 0.0562,
      "num_tokens": 70534231.0,
      "reward": 0.7375170588493347,
      "reward_std": 0.19667121767997742,
      "rewards/reward_func/mean": 0.7375170588493347,
      "rewards/reward_func/std": 0.19667121767997742,
      "step": 2567,
      "step_time": 20.278707768768072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 132.75,
      "completions/mean_terminated_length": 132.75,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2700130119919777,
      "epoch": 0.11894395553496989,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033549421932548285,
      "kl": 0.002145721489796415,
      "learning_rate": 9.762204724409448e-07,
      "loss": 0.0001,
      "num_tokens": 70554931.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2568,
      "step_time": 15.19302299246192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 280.0,
      "completions/max_terminated_length": 280.0,
      "completions/mean_length": 203.8125,
      "completions/mean_terminated_length": 203.8125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.40337076783180237,
      "epoch": 0.1189902732746642,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08717337250709534,
      "kl": 0.005992951802909374,
      "learning_rate": 9.76211208893006e-07,
      "loss": -0.0625,
      "num_tokens": 70579136.0,
      "reward": 0.05871332064270973,
      "reward_std": 0.23485326766967773,
      "rewards/reward_func/mean": 0.05871332064270973,
      "rewards/reward_func/std": 0.23485326766967773,
      "step": 2569,
      "step_time": 25.097491294145584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 160.625,
      "completions/mean_terminated_length": 160.625,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.34616900235414505,
      "epoch": 0.1190365910143585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0039023745339363813,
      "kl": 0.0027815004577860236,
      "learning_rate": 9.76201945345067e-07,
      "loss": 0.0001,
      "num_tokens": 70600954.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2570,
      "step_time": 16.051095638424158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 157.4375,
      "completions/mean_terminated_length": 157.4375,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.23404672741889954,
      "epoch": 0.11908290875405281,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005644877906888723,
      "kl": 0.0047567912843078375,
      "learning_rate": 9.761926817971282e-07,
      "loss": 0.0002,
      "num_tokens": 70630833.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2571,
      "step_time": 18.684407092630863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 145.8125,
      "completions/mean_terminated_length": 145.8125,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.324734590947628,
      "epoch": 0.1191292264937471,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038916415069252253,
      "kl": 0.0022496864548884332,
      "learning_rate": 9.761834182491893e-07,
      "loss": 0.0001,
      "num_tokens": 70651374.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2572,
      "step_time": 15.262799922376871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 270.0,
      "completions/max_terminated_length": 270.0,
      "completions/mean_length": 199.75,
      "completions/mean_terminated_length": 199.75,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3342023342847824,
      "epoch": 0.11917554423344141,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1105809360742569,
      "kl": 0.00803664920385927,
      "learning_rate": 9.761741547012504e-07,
      "loss": -0.1154,
      "num_tokens": 70680602.0,
      "reward": 0.11411845684051514,
      "reward_std": 0.311830997467041,
      "rewards/reward_func/mean": 0.11411845684051514,
      "rewards/reward_func/std": 0.311830997467041,
      "step": 2573,
      "step_time": 24.5421023927629
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 187.375,
      "completions/mean_terminated_length": 187.375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3873288184404373,
      "epoch": 0.11922186197313571,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023885746486485004,
      "kl": 0.0021763765544164926,
      "learning_rate": 9.761648911533118e-07,
      "loss": 0.0001,
      "num_tokens": 70718064.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2574,
      "step_time": 23.26786745339632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 161.1875,
      "completions/mean_terminated_length": 161.1875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.2526915520429611,
      "epoch": 0.11926817971283002,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004863819573074579,
      "kl": 0.003397050779312849,
      "learning_rate": 9.76155627605373e-07,
      "loss": 0.0002,
      "num_tokens": 70754659.0,
      "reward": 0.45559218525886536,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.45559218525886536,
      "rewards/reward_func/std": 0.0,
      "step": 2575,
      "step_time": 20.47814941033721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 133.25,
      "completions/mean_terminated_length": 133.25,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.2699509561061859,
      "epoch": 0.11931449745252432,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004671419970691204,
      "kl": 0.002445378544507548,
      "learning_rate": 9.76146364057434e-07,
      "loss": 0.0001,
      "num_tokens": 70774231.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2576,
      "step_time": 14.873398952186108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 189.25,
      "completions/mean_terminated_length": 189.25,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.36087697744369507,
      "epoch": 0.11936081519221863,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034232509788125753,
      "kl": 0.0024271892034448683,
      "learning_rate": 9.76137100509495e-07,
      "loss": 0.0001,
      "num_tokens": 70799515.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2577,
      "step_time": 19.80249995365739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 186.75,
      "completions/mean_terminated_length": 186.75,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.3625039979815483,
      "epoch": 0.11940713293191292,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09588645398616791,
      "kl": 0.0036418578820303082,
      "learning_rate": 9.761278369615563e-07,
      "loss": 0.0728,
      "num_tokens": 70823959.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 2578,
      "step_time": 21.55460439249873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 196.3125,
      "completions/mean_terminated_length": 196.3125,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.29934902489185333,
      "epoch": 0.11945345067160723,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09274392575025558,
      "kl": 0.005649623810313642,
      "learning_rate": 9.761185734136174e-07,
      "loss": -0.0318,
      "num_tokens": 70847436.0,
      "reward": 0.5290153622627258,
      "reward_std": 0.42527130246162415,
      "rewards/reward_func/mean": 0.5290153622627258,
      "rewards/reward_func/std": 0.42527130246162415,
      "step": 2579,
      "step_time": 19.9176363684237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 124.5,
      "completions/mean_terminated_length": 124.5,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.29691868275403976,
      "epoch": 0.11949976841130153,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006261197850108147,
      "kl": 0.003267173538915813,
      "learning_rate": 9.761093098656785e-07,
      "loss": 0.0002,
      "num_tokens": 70868916.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2580,
      "step_time": 13.36207140609622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 141.4375,
      "completions/mean_terminated_length": 141.4375,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.3163677826523781,
      "epoch": 0.11954608615099584,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002074207179248333,
      "kl": 0.0016918099427130073,
      "learning_rate": 9.761000463177397e-07,
      "loss": 0.0001,
      "num_tokens": 70895595.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2581,
      "step_time": 16.02123297378421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 197.75,
      "completions/mean_terminated_length": 197.75,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.4043612703680992,
      "epoch": 0.11959240389069013,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08586780726909637,
      "kl": 0.005323318357113749,
      "learning_rate": 9.760907827698008e-07,
      "loss": -0.0612,
      "num_tokens": 70925303.0,
      "reward": 0.11031211167573929,
      "reward_std": 0.3014300763607025,
      "rewards/reward_func/mean": 0.11031211167573929,
      "rewards/reward_func/std": 0.3014300763607025,
      "step": 2582,
      "step_time": 23.118612952530384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 155.3125,
      "completions/mean_terminated_length": 155.3125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.19518357142806053,
      "epoch": 0.11963872163038444,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030153447296470404,
      "kl": 0.0020839257049374282,
      "learning_rate": 9.76081519221862e-07,
      "loss": 0.0001,
      "num_tokens": 70947020.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 2583,
      "step_time": 16.24068732187152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 188.625,
      "completions/mean_terminated_length": 188.625,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.37758927047252655,
      "epoch": 0.11968503937007874,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12246187776327133,
      "kl": 0.002398409938905388,
      "learning_rate": 9.76072255673923e-07,
      "loss": -0.0719,
      "num_tokens": 70971526.0,
      "reward": 0.05173708125948906,
      "reward_std": 0.20694832503795624,
      "rewards/reward_func/mean": 0.05173708125948906,
      "rewards/reward_func/std": 0.20694833993911743,
      "step": 2584,
      "step_time": 21.39801001176238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 139.125,
      "completions/mean_terminated_length": 139.125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.39037761837244034,
      "epoch": 0.11973135710977305,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014450823655351996,
      "kl": 0.0017771337588783354,
      "learning_rate": 9.760629921259842e-07,
      "loss": 0.0001,
      "num_tokens": 71013768.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2585,
      "step_time": 20.699903801083565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 176.375,
      "completions/mean_terminated_length": 176.375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3848694711923599,
      "epoch": 0.11977767484946734,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005286119412630796,
      "kl": 0.0035452512674964964,
      "learning_rate": 9.760537285780453e-07,
      "loss": 0.0002,
      "num_tokens": 71035806.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2586,
      "step_time": 19.202445048838854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 118.8125,
      "completions/mean_terminated_length": 118.8125,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3058849424123764,
      "epoch": 0.11982399258916165,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005261395126581192,
      "kl": 0.002903709071688354,
      "learning_rate": 9.760444650301066e-07,
      "loss": 0.0001,
      "num_tokens": 71055963.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2587,
      "step_time": 13.46128412336111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 156.125,
      "completions/mean_terminated_length": 156.125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.1507360003888607,
      "epoch": 0.11987031032885595,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009575008880347013,
      "kl": 0.0008947111200541258,
      "learning_rate": 9.760352014821677e-07,
      "loss": 0.0,
      "num_tokens": 71077789.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2588,
      "step_time": 16.469407685101032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 134.625,
      "completions/mean_terminated_length": 134.625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.25572817400097847,
      "epoch": 0.11991662806855026,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002382407197728753,
      "kl": 0.0014073697093408555,
      "learning_rate": 9.760259379342287e-07,
      "loss": 0.0001,
      "num_tokens": 71099015.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2589,
      "step_time": 15.378913719207048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 168.375,
      "completions/mean_terminated_length": 168.375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.24062542989850044,
      "epoch": 0.11996294580824456,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005165655631572008,
      "kl": 0.005093358689919114,
      "learning_rate": 9.760166743862898e-07,
      "loss": 0.0003,
      "num_tokens": 71123901.0,
      "reward": 0.26359713077545166,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.26359713077545166,
      "rewards/reward_func/std": 0.0,
      "step": 2590,
      "step_time": 18.162719149142504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 160.6875,
      "completions/mean_terminated_length": 160.6875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.24116867035627365,
      "epoch": 0.12000926354793887,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029185765888541937,
      "kl": 0.001830112305469811,
      "learning_rate": 9.760074108383511e-07,
      "loss": 0.0001,
      "num_tokens": 71148328.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 2591,
      "step_time": 18.324216801673174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 182.9375,
      "completions/mean_terminated_length": 182.9375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.45394784212112427,
      "epoch": 0.12005558128763316,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038516896311193705,
      "kl": 0.0026644898462109268,
      "learning_rate": 9.759981472904122e-07,
      "loss": 0.0001,
      "num_tokens": 71176535.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2592,
      "step_time": 20.339137833565474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 121.6875,
      "completions/mean_terminated_length": 121.6875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.22174376249313354,
      "epoch": 0.12010189902732747,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024807672016322613,
      "kl": 0.0015302609535865486,
      "learning_rate": 9.759888837424734e-07,
      "loss": 0.0001,
      "num_tokens": 71195826.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2593,
      "step_time": 13.547740194946527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 166.4375,
      "completions/mean_terminated_length": 166.4375,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.1806536540389061,
      "epoch": 0.12014821676702177,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09671524912118912,
      "kl": 0.0009749295422807336,
      "learning_rate": 9.759796201945345e-07,
      "loss": -0.0581,
      "num_tokens": 71227993.0,
      "reward": 0.9300388693809509,
      "reward_std": 0.027310028672218323,
      "rewards/reward_func/mean": 0.9300388693809509,
      "rewards/reward_func/std": 0.027310030534863472,
      "step": 2594,
      "step_time": 20.047545570880175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 142.4375,
      "completions/mean_terminated_length": 142.4375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.29837728291749954,
      "epoch": 0.12019453450671608,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00585030484944582,
      "kl": 0.0030422385316342115,
      "learning_rate": 9.759703566465956e-07,
      "loss": 0.0002,
      "num_tokens": 71248192.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2595,
      "step_time": 15.379008781164885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 439.0,
      "completions/max_terminated_length": 439.0,
      "completions/mean_length": 324.75,
      "completions/mean_terminated_length": 324.75,
      "completions/min_length": 229.0,
      "completions/min_terminated_length": 229.0,
      "entropy": 0.2556700184941292,
      "epoch": 0.12024085224641037,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07441122084856033,
      "kl": 0.0030584142077714205,
      "learning_rate": 9.759610930986567e-07,
      "loss": -0.2181,
      "num_tokens": 71275212.0,
      "reward": 0.30286088585853577,
      "reward_std": 0.3167165517807007,
      "rewards/reward_func/mean": 0.30286088585853577,
      "rewards/reward_func/std": 0.3167165517807007,
      "step": 2596,
      "step_time": 35.48474219441414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 167.6875,
      "completions/mean_terminated_length": 167.6875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.21612903103232384,
      "epoch": 0.12028716998610468,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008417708799242973,
      "kl": 0.004941831110045314,
      "learning_rate": 9.759518295507179e-07,
      "loss": 0.0002,
      "num_tokens": 71306199.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2597,
      "step_time": 19.600908558815718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 158.0,
      "completions/mean_terminated_length": 158.0,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.277831107378006,
      "epoch": 0.12033348772579898,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005522989667952061,
      "kl": 0.004178764531388879,
      "learning_rate": 9.75942566002779e-07,
      "loss": 0.0002,
      "num_tokens": 71331847.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 2598,
      "step_time": 18.852451380342245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 152.5,
      "completions/mean_terminated_length": 152.5,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.39470214396715164,
      "epoch": 0.12037980546549329,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023999400436878204,
      "kl": 0.0022791545488871634,
      "learning_rate": 9.759333024548401e-07,
      "loss": 0.0001,
      "num_tokens": 71363839.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2599,
      "step_time": 18.758001688867807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 328.0,
      "completions/max_terminated_length": 328.0,
      "completions/mean_length": 238.5,
      "completions/mean_terminated_length": 238.5,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.1609300747513771,
      "epoch": 0.12042612320518759,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0843634232878685,
      "kl": 0.0023851240111980587,
      "learning_rate": 9.759240389069012e-07,
      "loss": -0.1551,
      "num_tokens": 71391031.0,
      "reward": 0.7583180069923401,
      "reward_std": 0.3343544900417328,
      "rewards/reward_func/mean": 0.7583180069923401,
      "rewards/reward_func/std": 0.3343545198440552,
      "step": 2600,
      "step_time": 29.403232384473085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 311.0,
      "completions/max_terminated_length": 311.0,
      "completions/mean_length": 228.125,
      "completions/mean_terminated_length": 228.125,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.44475042819976807,
      "epoch": 0.1204724409448819,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11624807119369507,
      "kl": 0.004848725977353752,
      "learning_rate": 9.759147753589624e-07,
      "loss": -0.0869,
      "num_tokens": 71415833.0,
      "reward": 0.0005881556426174939,
      "reward_std": 0.0023526225704699755,
      "rewards/reward_func/mean": 0.0005881556426174939,
      "rewards/reward_func/std": 0.002352622803300619,
      "step": 2601,
      "step_time": 26.487067949026823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 173.875,
      "completions/mean_terminated_length": 173.875,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.37038053572177887,
      "epoch": 0.12051875868457619,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11069431155920029,
      "kl": 0.004275827028322965,
      "learning_rate": 9.759055118110235e-07,
      "loss": -0.0373,
      "num_tokens": 71452647.0,
      "reward": 0.0625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.0625,
      "rewards/reward_func/std": 0.25,
      "step": 2602,
      "step_time": 22.033695995807648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 128.375,
      "completions/mean_terminated_length": 128.375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2706213742494583,
      "epoch": 0.1205650764242705,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022401295136660337,
      "kl": 0.001878547394881025,
      "learning_rate": 9.758962482630846e-07,
      "loss": 0.0001,
      "num_tokens": 71472253.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2603,
      "step_time": 14.252593986690044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 270.0,
      "completions/max_terminated_length": 270.0,
      "completions/mean_length": 178.9375,
      "completions/mean_terminated_length": 178.9375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.22292347624897957,
      "epoch": 0.1206113941639648,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1287321150302887,
      "kl": 0.009223586996085942,
      "learning_rate": 9.75886984715146e-07,
      "loss": 0.0937,
      "num_tokens": 71496172.0,
      "reward": 0.8979924917221069,
      "reward_std": 0.28671565651893616,
      "rewards/reward_func/mean": 0.8979924917221069,
      "rewards/reward_func/std": 0.28671562671661377,
      "step": 2604,
      "step_time": 23.865822471678257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.35358650237321854,
      "epoch": 0.1206577119036591,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005480792839080095,
      "kl": 0.004116107884328812,
      "learning_rate": 9.75877721167207e-07,
      "loss": 0.0002,
      "num_tokens": 71517802.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2605,
      "step_time": 17.80082791671157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 212.375,
      "completions/mean_terminated_length": 212.375,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.35293833166360855,
      "epoch": 0.1207040296433534,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12171871960163116,
      "kl": 0.006391683709807694,
      "learning_rate": 9.758684576192682e-07,
      "loss": -0.0574,
      "num_tokens": 71540912.0,
      "reward": 0.6539702415466309,
      "reward_std": 0.45536643266677856,
      "rewards/reward_func/mean": 0.6539702415466309,
      "rewards/reward_func/std": 0.45536643266677856,
      "step": 2606,
      "step_time": 22.94267251715064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 379.0,
      "completions/max_terminated_length": 379.0,
      "completions/mean_length": 219.0625,
      "completions/mean_terminated_length": 219.0625,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.34524622559547424,
      "epoch": 0.12075034738304771,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07989061623811722,
      "kl": 0.008095955941826105,
      "learning_rate": 9.758591940713293e-07,
      "loss": -0.1107,
      "num_tokens": 71562625.0,
      "reward": 0.5537140369415283,
      "reward_std": 0.5048525929450989,
      "rewards/reward_func/mean": 0.5537140369415283,
      "rewards/reward_func/std": 0.5048525929450989,
      "step": 2607,
      "step_time": 30.270117606967688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 118.125,
      "completions/mean_terminated_length": 118.125,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.29393845051527023,
      "epoch": 0.12079666512274201,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019197019282728434,
      "kl": 0.0014777126198168844,
      "learning_rate": 9.758499305233905e-07,
      "loss": 0.0001,
      "num_tokens": 71584819.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2608,
      "step_time": 13.530429046601057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 193.6875,
      "completions/mean_terminated_length": 193.6875,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.15781072154641151,
      "epoch": 0.12084298286243632,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004372835159301758,
      "kl": 0.002986446488648653,
      "learning_rate": 9.758406669754516e-07,
      "loss": 0.0001,
      "num_tokens": 71608574.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2609,
      "step_time": 20.004991702735424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 184.8125,
      "completions/mean_terminated_length": 184.8125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.38397885859012604,
      "epoch": 0.12088930060213061,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.139508455991745,
      "kl": 0.00393298888229765,
      "learning_rate": 9.758314034275127e-07,
      "loss": -0.0203,
      "num_tokens": 71638475.0,
      "reward": 0.05819142237305641,
      "reward_std": 0.23276568949222565,
      "rewards/reward_func/mean": 0.05819142237305641,
      "rewards/reward_func/std": 0.23276568949222565,
      "step": 2610,
      "step_time": 22.404640428721905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 183.125,
      "completions/mean_terminated_length": 183.125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.45040803402662277,
      "epoch": 0.12093561834182492,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020225944463163614,
      "kl": 0.0021493701497092843,
      "learning_rate": 9.758221398795738e-07,
      "loss": 0.0001,
      "num_tokens": 71676509.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2611,
      "step_time": 23.311338245868683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 382.0,
      "completions/max_terminated_length": 382.0,
      "completions/mean_length": 303.5,
      "completions/mean_terminated_length": 303.5,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "entropy": 0.23867250233888626,
      "epoch": 0.12098193608151922,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05662308260798454,
      "kl": 0.003504421270918101,
      "learning_rate": 9.75812876331635e-07,
      "loss": 0.0014,
      "num_tokens": 71716773.0,
      "reward": 0.954357922077179,
      "reward_std": 0.1825682520866394,
      "rewards/reward_func/mean": 0.954357922077179,
      "rewards/reward_func/std": 0.1825682371854782,
      "step": 2612,
      "step_time": 36.65858679264784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 152.25,
      "completions/mean_terminated_length": 152.25,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.253683403134346,
      "epoch": 0.12102825382121353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15901704132556915,
      "kl": 0.007373962085694075,
      "learning_rate": 9.75803612783696e-07,
      "loss": -0.0235,
      "num_tokens": 71737977.0,
      "reward": 0.7204751968383789,
      "reward_std": 0.3126756548881531,
      "rewards/reward_func/mean": 0.7204751968383789,
      "rewards/reward_func/std": 0.3126756548881531,
      "step": 2613,
      "step_time": 17.51973421871662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 199.5,
      "completions/mean_terminated_length": 199.5,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.14996739476919174,
      "epoch": 0.12107457156090783,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07721827179193497,
      "kl": 0.0010218483803328127,
      "learning_rate": 9.757943492357572e-07,
      "loss": -0.0523,
      "num_tokens": 71761713.0,
      "reward": 0.9444707632064819,
      "reward_std": 0.033111196011304855,
      "rewards/reward_func/mean": 0.9444707632064819,
      "rewards/reward_func/std": 0.03311121463775635,
      "step": 2614,
      "step_time": 20.895534090697765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 223.5,
      "completions/mean_terminated_length": 223.5,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "entropy": 0.18209245428442955,
      "epoch": 0.12112088930060214,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027935488615185022,
      "kl": 0.0019301688007544726,
      "learning_rate": 9.757850856878183e-07,
      "loss": 0.0001,
      "num_tokens": 71787145.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2615,
      "step_time": 22.16355014592409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 145.5,
      "completions/mean_terminated_length": 145.5,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.1945566162467003,
      "epoch": 0.12116720704029643,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00685846246778965,
      "kl": 0.0025526779936626554,
      "learning_rate": 9.757758221398794e-07,
      "loss": 0.0001,
      "num_tokens": 71807905.0,
      "reward": 0.8574039340019226,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8574039340019226,
      "rewards/reward_func/std": 0.0,
      "step": 2616,
      "step_time": 15.647626988589764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 143.3125,
      "completions/mean_terminated_length": 143.3125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.3358049765229225,
      "epoch": 0.12121352477999074,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012972364202141762,
      "kl": 0.0014078713720664382,
      "learning_rate": 9.757665585919408e-07,
      "loss": 0.0001,
      "num_tokens": 71833318.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2617,
      "step_time": 16.322649911046028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 151.125,
      "completions/mean_terminated_length": 151.125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.3704020455479622,
      "epoch": 0.12125984251968504,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004383946768939495,
      "kl": 0.002583772409707308,
      "learning_rate": 9.75757295044002e-07,
      "loss": 0.0001,
      "num_tokens": 71856072.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2618,
      "step_time": 16.43452950939536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 203.0,
      "completions/mean_terminated_length": 203.0,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.3492243140935898,
      "epoch": 0.12130616025937935,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1025136262178421,
      "kl": 0.004411309317220002,
      "learning_rate": 9.75748031496063e-07,
      "loss": -0.0108,
      "num_tokens": 71878600.0,
      "reward": 0.7462133169174194,
      "reward_std": 0.4452076554298401,
      "rewards/reward_func/mean": 0.7462133169174194,
      "rewards/reward_func/std": 0.4452076852321625,
      "step": 2619,
      "step_time": 23.544443495571613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 190.75,
      "completions/mean_terminated_length": 190.75,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.25944357365369797,
      "epoch": 0.12135247799907364,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1123427227139473,
      "kl": 0.005246045300737023,
      "learning_rate": 9.75738767948124e-07,
      "loss": -0.0224,
      "num_tokens": 71911508.0,
      "reward": 0.399912029504776,
      "reward_std": 0.3898077607154846,
      "rewards/reward_func/mean": 0.399912029504776,
      "rewards/reward_func/std": 0.3898077607154846,
      "step": 2620,
      "step_time": 22.157683491706848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 146.75,
      "completions/mean_terminated_length": 146.75,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.3209523856639862,
      "epoch": 0.12139879573876795,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011952482163906097,
      "kl": 0.001210305985296145,
      "learning_rate": 9.757295044001853e-07,
      "loss": 0.0001,
      "num_tokens": 71937264.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2621,
      "step_time": 15.85201332718134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 172.8125,
      "completions/mean_terminated_length": 172.8125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.3508635461330414,
      "epoch": 0.12144511347846225,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022795661352574825,
      "kl": 0.001664304523728788,
      "learning_rate": 9.757202408522464e-07,
      "loss": 0.0001,
      "num_tokens": 71971261.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2622,
      "step_time": 22.573575280606747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 122.5625,
      "completions/mean_terminated_length": 122.5625,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.26133843511343,
      "epoch": 0.12149143121815656,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022759540006518364,
      "kl": 0.0015060979349073023,
      "learning_rate": 9.757109773043075e-07,
      "loss": 0.0001,
      "num_tokens": 71990838.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2623,
      "step_time": 14.58366310223937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 165.6875,
      "completions/mean_terminated_length": 165.6875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.24231448769569397,
      "epoch": 0.12153774895785086,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015472517115995288,
      "kl": 0.0014273403503466398,
      "learning_rate": 9.757017137563687e-07,
      "loss": 0.0001,
      "num_tokens": 72016017.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2624,
      "step_time": 17.579478468745947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 301.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 227.5,
      "completions/mean_terminated_length": 227.5,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.39033132046461105,
      "epoch": 0.12158406669754517,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07186461240053177,
      "kl": 0.008960372069850564,
      "learning_rate": 9.756924502084298e-07,
      "loss": -0.058,
      "num_tokens": 72049801.0,
      "reward": 0.019167421385645866,
      "reward_std": 0.02770277112722397,
      "rewards/reward_func/mean": 0.019167421385645866,
      "rewards/reward_func/std": 0.02770277112722397,
      "step": 2625,
      "step_time": 28.25566239282489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 312.0,
      "completions/max_terminated_length": 312.0,
      "completions/mean_length": 270.375,
      "completions/mean_terminated_length": 270.375,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "entropy": 0.25303342938423157,
      "epoch": 0.12163038443723946,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027672620490193367,
      "kl": 0.0017814805614762008,
      "learning_rate": 9.75683186660491e-07,
      "loss": 0.0001,
      "num_tokens": 72088959.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2626,
      "step_time": 30.33176765963435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 168.6875,
      "completions/mean_terminated_length": 168.6875,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.2397082783281803,
      "epoch": 0.12167670217693377,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10525412112474442,
      "kl": 0.002499947266187519,
      "learning_rate": 9.75673923112552e-07,
      "loss": -0.099,
      "num_tokens": 72120266.0,
      "reward": 0.6049246191978455,
      "reward_std": 0.31606027483940125,
      "rewards/reward_func/mean": 0.6049246191978455,
      "rewards/reward_func/std": 0.31606027483940125,
      "step": 2627,
      "step_time": 20.56935388967395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 188.8125,
      "completions/mean_terminated_length": 188.8125,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.41700972616672516,
      "epoch": 0.12172301991662807,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1298043578863144,
      "kl": 0.007502533379010856,
      "learning_rate": 9.756646595646132e-07,
      "loss": -0.1065,
      "num_tokens": 72147207.0,
      "reward": 0.11742663383483887,
      "reward_std": 0.32087063789367676,
      "rewards/reward_func/mean": 0.11742663383483887,
      "rewards/reward_func/std": 0.32087066769599915,
      "step": 2628,
      "step_time": 22.719747003167868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 196.4375,
      "completions/mean_terminated_length": 196.4375,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.2458793967962265,
      "epoch": 0.12176933765632238,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003123057307675481,
      "kl": 0.0017426459526177496,
      "learning_rate": 9.756553960166743e-07,
      "loss": 0.0001,
      "num_tokens": 72184718.0,
      "reward": 0.6761743426322937,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6761743426322937,
      "rewards/reward_func/std": 0.0,
      "step": 2629,
      "step_time": 23.518625486642122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 200.25,
      "completions/mean_terminated_length": 200.25,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3304623067378998,
      "epoch": 0.12181565539601667,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.020174020901322365,
      "kl": 0.004885137430392206,
      "learning_rate": 9.756461324687354e-07,
      "loss": 0.0002,
      "num_tokens": 72224226.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2630,
      "step_time": 26.105417896062136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 194.8125,
      "completions/mean_terminated_length": 194.8125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.39290958642959595,
      "epoch": 0.12186197313571098,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001977944280952215,
      "kl": 0.002162375603802502,
      "learning_rate": 9.756368689207967e-07,
      "loss": 0.0001,
      "num_tokens": 72283327.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2631,
      "step_time": 30.047461956739426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 142.4375,
      "completions/mean_terminated_length": 142.4375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.1866447888314724,
      "epoch": 0.12190829087540528,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1311248540878296,
      "kl": 0.004332931013777852,
      "learning_rate": 9.756276053728577e-07,
      "loss": -0.0128,
      "num_tokens": 72304950.0,
      "reward": 0.9241602420806885,
      "reward_std": 0.10709454119205475,
      "rewards/reward_func/mean": 0.9241602420806885,
      "rewards/reward_func/std": 0.10709454119205475,
      "step": 2632,
      "step_time": 15.393566634505987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 279.0,
      "completions/max_terminated_length": 279.0,
      "completions/mean_length": 227.3125,
      "completions/mean_terminated_length": 227.3125,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.38326458632946014,
      "epoch": 0.12195460861509959,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005797903053462505,
      "kl": 0.004306967603042722,
      "learning_rate": 9.756183418249188e-07,
      "loss": 0.0002,
      "num_tokens": 72334171.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2633,
      "step_time": 24.80897504463792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 154.0,
      "completions/mean_terminated_length": 154.0,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3198411762714386,
      "epoch": 0.12200092635479388,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007689288817346096,
      "kl": 0.0034146554535254836,
      "learning_rate": 9.756090782769801e-07,
      "loss": 0.0002,
      "num_tokens": 72369195.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2634,
      "step_time": 19.71334460005164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 192.375,
      "completions/mean_terminated_length": 192.375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.27632561698555946,
      "epoch": 0.1220472440944882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10215043276548386,
      "kl": 0.005216853343881667,
      "learning_rate": 9.755998147290412e-07,
      "loss": -0.0255,
      "num_tokens": 72391185.0,
      "reward": 0.5789077281951904,
      "reward_std": 0.3379634916782379,
      "rewards/reward_func/mean": 0.5789077281951904,
      "rewards/reward_func/std": 0.3379634916782379,
      "step": 2635,
      "step_time": 21.428825974464417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 140.625,
      "completions/mean_terminated_length": 140.625,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3496779501438141,
      "epoch": 0.12209356183418249,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00287781935185194,
      "kl": 0.0020887544378638268,
      "learning_rate": 9.755905511811024e-07,
      "loss": 0.0001,
      "num_tokens": 72414331.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2636,
      "step_time": 15.327082812786102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 200.625,
      "completions/mean_terminated_length": 200.625,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.24473798647522926,
      "epoch": 0.1221398795738768,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0794215276837349,
      "kl": 0.005211790441535413,
      "learning_rate": 9.755812876331635e-07,
      "loss": -0.0117,
      "num_tokens": 72435925.0,
      "reward": 0.9945688247680664,
      "reward_std": 0.021724820137023926,
      "rewards/reward_func/mean": 0.9945688247680664,
      "rewards/reward_func/std": 0.021724820137023926,
      "step": 2637,
      "step_time": 20.33620259165764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 139.3125,
      "completions/mean_terminated_length": 139.3125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.23275908082723618,
      "epoch": 0.1221861973135711,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002283212961629033,
      "kl": 0.0019031875126529485,
      "learning_rate": 9.755720240852246e-07,
      "loss": 0.0001,
      "num_tokens": 72455546.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2638,
      "step_time": 15.039844371378422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 189.3125,
      "completions/mean_terminated_length": 189.3125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.2202717587351799,
      "epoch": 0.1222325150532654,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003921009600162506,
      "kl": 0.0030107529601082206,
      "learning_rate": 9.755627605372857e-07,
      "loss": 0.0002,
      "num_tokens": 72480735.0,
      "reward": 0.9000876545906067,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9000876545906067,
      "rewards/reward_func/std": 0.0,
      "step": 2639,
      "step_time": 20.762874558568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 154.6875,
      "completions/mean_terminated_length": 154.6875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.1303444765508175,
      "epoch": 0.1222788327929597,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010071864817291498,
      "kl": 0.0007015015071374364,
      "learning_rate": 9.755534969893469e-07,
      "loss": 0.0,
      "num_tokens": 72513690.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 2640,
      "step_time": 19.09148909151554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 168.1875,
      "completions/mean_terminated_length": 168.1875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.162997767329216,
      "epoch": 0.12232515053265401,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00249607115983963,
      "kl": 0.0019170307787135243,
      "learning_rate": 9.75544233441408e-07,
      "loss": 0.0001,
      "num_tokens": 72537901.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2641,
      "step_time": 17.687778558582067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 110.25,
      "completions/mean_terminated_length": 110.25,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.2281123660504818,
      "epoch": 0.12237146827234831,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002711585024371743,
      "kl": 0.0018292743479833007,
      "learning_rate": 9.755349698934691e-07,
      "loss": 0.0001,
      "num_tokens": 72557601.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2642,
      "step_time": 13.201411411166191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 132.0,
      "completions/mean_terminated_length": 132.0,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2339770272374153,
      "epoch": 0.12241778601204262,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008354433812201023,
      "kl": 0.004901603446342051,
      "learning_rate": 9.755257063455302e-07,
      "loss": 0.0002,
      "num_tokens": 72577185.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2643,
      "step_time": 14.315678246319294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 163.1875,
      "completions/mean_terminated_length": 163.1875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.37413283437490463,
      "epoch": 0.12246410375173691,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002193914959207177,
      "kl": 0.002328070200746879,
      "learning_rate": 9.755164427975916e-07,
      "loss": 0.0001,
      "num_tokens": 72609316.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2644,
      "step_time": 18.856964860111475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 291.0,
      "completions/max_terminated_length": 291.0,
      "completions/mean_length": 187.25,
      "completions/mean_terminated_length": 187.25,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.36122526973485947,
      "epoch": 0.12251042149143122,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1051226556301117,
      "kl": 0.007154420600272715,
      "learning_rate": 9.755071792496525e-07,
      "loss": -0.1066,
      "num_tokens": 72630632.0,
      "reward": 0.09804060310125351,
      "reward_std": 0.26789790391921997,
      "rewards/reward_func/mean": 0.09804060310125351,
      "rewards/reward_func/std": 0.26789796352386475,
      "step": 2645,
      "step_time": 24.785052228718996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 180.625,
      "completions/mean_terminated_length": 180.625,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.18327955156564713,
      "epoch": 0.12255673923112552,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001266221865080297,
      "kl": 0.0010286837787134573,
      "learning_rate": 9.754979157017136e-07,
      "loss": 0.0001,
      "num_tokens": 72663506.0,
      "reward": 0.8668779134750366,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8668779134750366,
      "rewards/reward_func/std": 0.0,
      "step": 2646,
      "step_time": 21.732679691165686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.23567169904708862,
      "epoch": 0.12260305697081983,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09320887923240662,
      "kl": 0.002246777032269165,
      "learning_rate": 9.754886521537747e-07,
      "loss": -0.0223,
      "num_tokens": 72688000.0,
      "reward": 0.24785666167736053,
      "reward_std": 0.013929652981460094,
      "rewards/reward_func/mean": 0.24785666167736053,
      "rewards/reward_func/std": 0.013929653912782669,
      "step": 2647,
      "step_time": 19.900317683815956
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 194.3125,
      "completions/mean_terminated_length": 194.3125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.21999873220920563,
      "epoch": 0.12264937471051413,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13728246092796326,
      "kl": 0.004719353746622801,
      "learning_rate": 9.75479388605836e-07,
      "loss": -0.0452,
      "num_tokens": 72709285.0,
      "reward": 0.9377368092536926,
      "reward_std": 0.2490527629852295,
      "rewards/reward_func/mean": 0.9377368092536926,
      "rewards/reward_func/std": 0.2490527629852295,
      "step": 2648,
      "step_time": 20.027044255286455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 179.0625,
      "completions/mean_terminated_length": 179.0625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.20912526175379753,
      "epoch": 0.12269569245020844,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003093697363510728,
      "kl": 0.0020216996781527996,
      "learning_rate": 9.754701250578972e-07,
      "loss": 0.0001,
      "num_tokens": 72742998.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2649,
      "step_time": 22.07695698738098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 189.375,
      "completions/mean_terminated_length": 189.375,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.33261389285326004,
      "epoch": 0.12274201018990273,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024126721546053886,
      "kl": 0.0023820858041290194,
      "learning_rate": 9.754608615099583e-07,
      "loss": 0.0001,
      "num_tokens": 72783276.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2650,
      "step_time": 24.320692989975214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 125.0625,
      "completions/mean_terminated_length": 125.0625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.3045237138867378,
      "epoch": 0.12278832792959704,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012520633172243834,
      "kl": 0.0013071002904325724,
      "learning_rate": 9.754515979620195e-07,
      "loss": 0.0001,
      "num_tokens": 72807533.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2651,
      "step_time": 15.852440193295479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 218.4375,
      "completions/mean_terminated_length": 218.4375,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "entropy": 0.15901795402169228,
      "epoch": 0.12283464566929134,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003128107637166977,
      "kl": 0.0022057093447074294,
      "learning_rate": 9.754423344140806e-07,
      "loss": 0.0001,
      "num_tokens": 72832084.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2652,
      "step_time": 21.951898373663425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 134.6875,
      "completions/mean_terminated_length": 134.6875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.30614544451236725,
      "epoch": 0.12288096340898565,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032622588332742453,
      "kl": 0.0024048115592449903,
      "learning_rate": 9.754330708661417e-07,
      "loss": 0.0001,
      "num_tokens": 72853919.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2653,
      "step_time": 15.294451046735048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 185.1875,
      "completions/mean_terminated_length": 185.1875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.4151734560728073,
      "epoch": 0.12292728114867994,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01059812679886818,
      "kl": 0.004364458902273327,
      "learning_rate": 9.754238073182028e-07,
      "loss": 0.0002,
      "num_tokens": 72878610.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2654,
      "step_time": 20.326975125819445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 288.0,
      "completions/max_terminated_length": 288.0,
      "completions/mean_length": 248.8125,
      "completions/mean_terminated_length": 248.8125,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "entropy": 0.21510307118296623,
      "epoch": 0.12297359888837425,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09212217479944229,
      "kl": 0.007700381334871054,
      "learning_rate": 9.75414543770264e-07,
      "loss": -0.0667,
      "num_tokens": 72906239.0,
      "reward": 0.8717219829559326,
      "reward_std": 0.137044757604599,
      "rewards/reward_func/mean": 0.8717219829559326,
      "rewards/reward_func/std": 0.137044757604599,
      "step": 2655,
      "step_time": 25.168050318956375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 139.8125,
      "completions/mean_terminated_length": 139.8125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.17870675027370453,
      "epoch": 0.12301991662806855,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011638562427833676,
      "kl": 0.0007753275713184848,
      "learning_rate": 9.75405280222325e-07,
      "loss": 0.0,
      "num_tokens": 72941644.0,
      "reward": 0.8007373809814453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8007373809814453,
      "rewards/reward_func/std": 0.0,
      "step": 2656,
      "step_time": 18.49212707579136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 168.4375,
      "completions/mean_terminated_length": 168.4375,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.27530380338430405,
      "epoch": 0.12306623436776286,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025134237948805094,
      "kl": 0.0016946768737398088,
      "learning_rate": 9.753960166743862e-07,
      "loss": 0.0001,
      "num_tokens": 72966051.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2657,
      "step_time": 17.530892979353666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 117.75,
      "completions/mean_terminated_length": 117.75,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.30782726407051086,
      "epoch": 0.12311255210745715,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004912033211439848,
      "kl": 0.002673172624781728,
      "learning_rate": 9.753867531264473e-07,
      "loss": 0.0001,
      "num_tokens": 72987167.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2658,
      "step_time": 13.192748345434666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 127.375,
      "completions/mean_terminated_length": 127.375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.32113301008939743,
      "epoch": 0.12315886984715146,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024710255675017834,
      "kl": 0.0023215124383568764,
      "learning_rate": 9.753774895785085e-07,
      "loss": 0.0001,
      "num_tokens": 73015877.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2659,
      "step_time": 15.505543787032366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 149.0,
      "completions/mean_terminated_length": 149.0,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.23406724631786346,
      "epoch": 0.12320518758684576,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004036585334688425,
      "kl": 0.002077404933515936,
      "learning_rate": 9.753682260305696e-07,
      "loss": 0.0001,
      "num_tokens": 73035989.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2660,
      "step_time": 15.954175382852554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 136.875,
      "completions/mean_terminated_length": 136.875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.3086076006293297,
      "epoch": 0.12325150532654007,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027811750769615173,
      "kl": 0.002006874361541122,
      "learning_rate": 9.75358962482631e-07,
      "loss": 0.0001,
      "num_tokens": 73060579.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2661,
      "step_time": 16.9653382524848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 134.375,
      "completions/mean_terminated_length": 134.375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.26886285096406937,
      "epoch": 0.12329782306623437,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034270649775862694,
      "kl": 0.0019199447997380048,
      "learning_rate": 9.75349698934692e-07,
      "loss": 0.0001,
      "num_tokens": 73080089.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2662,
      "step_time": 13.628558360040188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 157.0,
      "completions/mean_terminated_length": 157.0,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.2314070351421833,
      "epoch": 0.12334414080592868,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002566870767623186,
      "kl": 0.001634041196666658,
      "learning_rate": 9.75340435386753e-07,
      "loss": 0.0001,
      "num_tokens": 73105721.0,
      "reward": 0.024372844025492668,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.024372844025492668,
      "rewards/reward_func/std": 0.0,
      "step": 2663,
      "step_time": 16.70699230581522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 179.625,
      "completions/mean_terminated_length": 179.625,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.33533792942762375,
      "epoch": 0.12339045854562297,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024146882351487875,
      "kl": 0.0016336151165887713,
      "learning_rate": 9.753311718388143e-07,
      "loss": 0.0001,
      "num_tokens": 73135939.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2664,
      "step_time": 20.23602031543851
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 468.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 266.9375,
      "completions/mean_terminated_length": 266.9375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.4119087755680084,
      "epoch": 0.12343677628531728,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07697068154811859,
      "kl": 0.004283018934074789,
      "learning_rate": 9.753219082908754e-07,
      "loss": -0.044,
      "num_tokens": 73162658.0,
      "reward": 0.06978751718997955,
      "reward_std": 0.24942469596862793,
      "rewards/reward_func/mean": 0.06978751718997955,
      "rewards/reward_func/std": 0.24942469596862793,
      "step": 2665,
      "step_time": 37.28246930614114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 193.4375,
      "completions/mean_terminated_length": 193.4375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.23821564763784409,
      "epoch": 0.12348309402501158,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08502568304538727,
      "kl": 0.006176152848638594,
      "learning_rate": 9.753126447429365e-07,
      "loss": -0.0676,
      "num_tokens": 73201129.0,
      "reward": 0.08006329834461212,
      "reward_std": 0.1994636505842209,
      "rewards/reward_func/mean": 0.08006329834461212,
      "rewards/reward_func/std": 0.19946368038654327,
      "step": 2666,
      "step_time": 25.550567347556353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 151.4375,
      "completions/mean_terminated_length": 151.4375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3212057426571846,
      "epoch": 0.12352941176470589,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021431755740195513,
      "kl": 0.0019139970827382058,
      "learning_rate": 9.753033811949977e-07,
      "loss": 0.0001,
      "num_tokens": 73230832.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2667,
      "step_time": 17.31324952840805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 154.4375,
      "completions/mean_terminated_length": 154.4375,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.46347784250974655,
      "epoch": 0.12357572950440018,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002197732450440526,
      "kl": 0.001981659181183204,
      "learning_rate": 9.752941176470588e-07,
      "loss": 0.0001,
      "num_tokens": 73273367.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2668,
      "step_time": 23.228794887661934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 156.4375,
      "completions/mean_terminated_length": 156.4375,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.1714119277894497,
      "epoch": 0.1236220472440945,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12525725364685059,
      "kl": 0.002197854861151427,
      "learning_rate": 9.7528485409912e-07,
      "loss": 0.004,
      "num_tokens": 73300238.0,
      "reward": 0.44468438625335693,
      "reward_std": 0.02321789413690567,
      "rewards/reward_func/mean": 0.44468438625335693,
      "rewards/reward_func/std": 0.02321789413690567,
      "step": 2669,
      "step_time": 17.755487963557243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 124.6875,
      "completions/mean_terminated_length": 124.6875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.3024565950036049,
      "epoch": 0.12366836498378879,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014723469503223896,
      "kl": 0.0016822517500258982,
      "learning_rate": 9.75275590551181e-07,
      "loss": 0.0001,
      "num_tokens": 73320505.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2670,
      "step_time": 13.288980275392532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 130.6875,
      "completions/mean_terminated_length": 130.6875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.33811046183109283,
      "epoch": 0.1237146827234831,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005108274519443512,
      "kl": 0.003483735374175012,
      "learning_rate": 9.752663270032422e-07,
      "loss": 0.0002,
      "num_tokens": 73341844.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2671,
      "step_time": 14.610954966396093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 422.0,
      "completions/max_terminated_length": 422.0,
      "completions/mean_length": 255.5625,
      "completions/mean_terminated_length": 255.5625,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.43313534557819366,
      "epoch": 0.1237610004631774,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10161624103784561,
      "kl": 0.0060406067641451955,
      "learning_rate": 9.752570634553033e-07,
      "loss": 0.0048,
      "num_tokens": 73367805.0,
      "reward": 0.0008797428454272449,
      "reward_std": 0.0013829093659296632,
      "rewards/reward_func/mean": 0.0008797428454272449,
      "rewards/reward_func/std": 0.001382909482344985,
      "step": 2672,
      "step_time": 35.404101356863976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 180.25,
      "completions/mean_terminated_length": 180.25,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.38385750353336334,
      "epoch": 0.1238073182028717,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034687796141952276,
      "kl": 0.0028997634071856737,
      "learning_rate": 9.752477999073644e-07,
      "loss": 0.0001,
      "num_tokens": 73400913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2673,
      "step_time": 20.9024547226727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 166.875,
      "completions/mean_terminated_length": 166.875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.3986580818891525,
      "epoch": 0.123853635942566,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015908145578578115,
      "kl": 0.0015960049058776349,
      "learning_rate": 9.752385363594258e-07,
      "loss": 0.0001,
      "num_tokens": 73439727.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2674,
      "step_time": 22.429946400225163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 187.375,
      "completions/mean_terminated_length": 187.375,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.19064150750637054,
      "epoch": 0.12389995368226031,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00739160506054759,
      "kl": 0.0064025112660601735,
      "learning_rate": 9.752292728114867e-07,
      "loss": 0.0003,
      "num_tokens": 73461829.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2675,
      "step_time": 19.385738972574472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 164.5,
      "completions/mean_terminated_length": 164.5,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3214581310749054,
      "epoch": 0.1239462714219546,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005774408113211393,
      "kl": 0.004261001828126609,
      "learning_rate": 9.752200092635478e-07,
      "loss": 0.0002,
      "num_tokens": 73482093.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2676,
      "step_time": 17.04783011227846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 156.125,
      "completions/mean_terminated_length": 156.125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.2922755032777786,
      "epoch": 0.12399258916164892,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009557011537253857,
      "kl": 0.006010084645822644,
      "learning_rate": 9.75210745715609e-07,
      "loss": 0.0003,
      "num_tokens": 73502575.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2677,
      "step_time": 15.541265804320574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 219.875,
      "completions/mean_terminated_length": 219.875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.3992610350251198,
      "epoch": 0.12403890690134321,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10632207989692688,
      "kl": 0.0053071328438818455,
      "learning_rate": 9.752014821676703e-07,
      "loss": -0.0821,
      "num_tokens": 73537949.0,
      "reward": 0.34610211849212646,
      "reward_std": 0.4094793498516083,
      "rewards/reward_func/mean": 0.34610211849212646,
      "rewards/reward_func/std": 0.4094793200492859,
      "step": 2678,
      "step_time": 25.628438390791416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 187.375,
      "completions/mean_terminated_length": 187.375,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.23493099212646484,
      "epoch": 0.12408522464103752,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11918675154447556,
      "kl": 0.012475994299165905,
      "learning_rate": 9.751922186197314e-07,
      "loss": -0.0148,
      "num_tokens": 73559283.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 2679,
      "step_time": 20.169689398258924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 191.375,
      "completions/mean_terminated_length": 191.375,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.24099867045879364,
      "epoch": 0.12413154238073182,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.18033720552921295,
      "kl": 0.018772387644276023,
      "learning_rate": 9.751829550717925e-07,
      "loss": 0.0011,
      "num_tokens": 73593225.0,
      "reward": 0.3162277638912201,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3162277638912201,
      "rewards/reward_func/std": 0.0,
      "step": 2680,
      "step_time": 25.221865363419056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 184.375,
      "completions/mean_terminated_length": 184.375,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.36500103771686554,
      "epoch": 0.12417786012042613,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018618119647726417,
      "kl": 0.0018729929579421878,
      "learning_rate": 9.751736915238536e-07,
      "loss": 0.0001,
      "num_tokens": 73626159.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2681,
      "step_time": 23.625973116606474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 113.3125,
      "completions/mean_terminated_length": 113.3125,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "entropy": 0.3030073195695877,
      "epoch": 0.12422417786012042,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038442956283688545,
      "kl": 0.002038826758507639,
      "learning_rate": 9.751644279759148e-07,
      "loss": 0.0001,
      "num_tokens": 73647604.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2682,
      "step_time": 13.847161881625652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 180.0625,
      "completions/mean_terminated_length": 180.0625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.3768431022763252,
      "epoch": 0.12427049559981473,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017164384480565786,
      "kl": 0.0014914613857399672,
      "learning_rate": 9.751551644279759e-07,
      "loss": 0.0001,
      "num_tokens": 73679717.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2683,
      "step_time": 25.22353618964553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 174.3125,
      "completions/mean_terminated_length": 174.3125,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.24410822242498398,
      "epoch": 0.12431681333950903,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01254366897046566,
      "kl": 0.008021327201277018,
      "learning_rate": 9.75145900880037e-07,
      "loss": 0.0004,
      "num_tokens": 73701018.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2684,
      "step_time": 18.110782250761986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 188.75,
      "completions/mean_terminated_length": 188.75,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.26619939506053925,
      "epoch": 0.12436313107920334,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031394518446177244,
      "kl": 0.0026092990301549435,
      "learning_rate": 9.751366373320981e-07,
      "loss": 0.0001,
      "num_tokens": 73735654.0,
      "reward": 0.0949237272143364,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0949237272143364,
      "rewards/reward_func/std": 0.0,
      "step": 2685,
      "step_time": 21.56949655711651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 384.0,
      "completions/mean_length": 287.625,
      "completions/mean_terminated_length": 287.625,
      "completions/min_length": 252.0,
      "completions/min_terminated_length": 252.0,
      "entropy": 0.31667467951774597,
      "epoch": 0.12440944881889764,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07866465300321579,
      "kl": 0.0056940873619169,
      "learning_rate": 9.751273737841593e-07,
      "loss": -0.0308,
      "num_tokens": 73769792.0,
      "reward": 0.786245584487915,
      "reward_std": 0.20967189967632294,
      "rewards/reward_func/mean": 0.786245584487915,
      "rewards/reward_func/std": 0.20967191457748413,
      "step": 2686,
      "step_time": 33.06748655810952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 157.9375,
      "completions/mean_terminated_length": 157.9375,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.45203348994255066,
      "epoch": 0.12445576655859195,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024263125378638506,
      "kl": 0.002125900180544704,
      "learning_rate": 9.751181102362206e-07,
      "loss": 0.0001,
      "num_tokens": 73808991.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2687,
      "step_time": 20.177612725645304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 177.0,
      "completions/mean_terminated_length": 177.0,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.19452517479658127,
      "epoch": 0.12450208429828624,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014004954136908054,
      "kl": 0.0011580679565668106,
      "learning_rate": 9.751088466882815e-07,
      "loss": 0.0001,
      "num_tokens": 73832031.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 2688,
      "step_time": 19.230411875993013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 149.5625,
      "completions/mean_terminated_length": 149.5625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.310242123901844,
      "epoch": 0.12454840203798055,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003193320706486702,
      "kl": 0.002363665000302717,
      "learning_rate": 9.750995831403426e-07,
      "loss": 0.0001,
      "num_tokens": 73858328.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2689,
      "step_time": 17.874194260686636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 122.0625,
      "completions/mean_terminated_length": 122.0625,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2829183302819729,
      "epoch": 0.12459471977767485,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00409874739125371,
      "kl": 0.002622196276206523,
      "learning_rate": 9.750903195924037e-07,
      "loss": 0.0001,
      "num_tokens": 73877753.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2690,
      "step_time": 13.463900413364172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 330.0,
      "completions/max_terminated_length": 330.0,
      "completions/mean_length": 290.8125,
      "completions/mean_terminated_length": 290.8125,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "entropy": 0.211647417396307,
      "epoch": 0.12464103751736916,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09355708211660385,
      "kl": 0.008499447256326675,
      "learning_rate": 9.75081056044465e-07,
      "loss": -0.0077,
      "num_tokens": 73902870.0,
      "reward": 0.9208402633666992,
      "reward_std": 0.0006037076818756759,
      "rewards/reward_func/mean": 0.9208402633666992,
      "rewards/reward_func/std": 0.0006037129205651581,
      "step": 2691,
      "step_time": 27.374474808573723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 203.375,
      "completions/mean_terminated_length": 203.375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.3093295618891716,
      "epoch": 0.12468735525706345,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13858255743980408,
      "kl": 0.005437292158603668,
      "learning_rate": 9.750717924965262e-07,
      "loss": -0.0444,
      "num_tokens": 73927404.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 2692,
      "step_time": 21.022662118077278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 124.125,
      "completions/mean_terminated_length": 124.125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.20365708321332932,
      "epoch": 0.12473367299675776,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003993342164903879,
      "kl": 0.0024475062964484096,
      "learning_rate": 9.750625289485873e-07,
      "loss": 0.0001,
      "num_tokens": 73946686.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2693,
      "step_time": 12.865562237799168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 219.9375,
      "completions/mean_terminated_length": 219.9375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.23746725171804428,
      "epoch": 0.12477999073645206,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0872751995921135,
      "kl": 0.005896298564039171,
      "learning_rate": 9.750532654006485e-07,
      "loss": 0.0241,
      "num_tokens": 73982685.0,
      "reward": 0.7184863090515137,
      "reward_std": 0.30690306425094604,
      "rewards/reward_func/mean": 0.7184863090515137,
      "rewards/reward_func/std": 0.30690306425094604,
      "step": 2694,
      "step_time": 24.768761537969112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 174.4375,
      "completions/mean_terminated_length": 174.4375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3785416856408119,
      "epoch": 0.12482630847614637,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004713712725788355,
      "kl": 0.003453952958807349,
      "learning_rate": 9.750440018527096e-07,
      "loss": 0.0002,
      "num_tokens": 74004596.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2695,
      "step_time": 18.08088992908597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 134.5625,
      "completions/mean_terminated_length": 134.5625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.31142277270555496,
      "epoch": 0.12487262621584067,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019075494492426515,
      "kl": 0.0015639386547263712,
      "learning_rate": 9.750347383047707e-07,
      "loss": 0.0001,
      "num_tokens": 74025037.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2696,
      "step_time": 14.804278913885355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 137.625,
      "completions/mean_terminated_length": 137.625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.30009637773036957,
      "epoch": 0.12491894395553498,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003276183269917965,
      "kl": 0.0020019312505610287,
      "learning_rate": 9.750254747568318e-07,
      "loss": 0.0001,
      "num_tokens": 74045639.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2697,
      "step_time": 15.823785934597254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 121.0,
      "completions/mean_terminated_length": 121.0,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.23315386101603508,
      "epoch": 0.12496526169522927,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009364706929773092,
      "kl": 0.001000961783574894,
      "learning_rate": 9.75016211208893e-07,
      "loss": 0.0001,
      "num_tokens": 74069127.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2698,
      "step_time": 13.778093438595533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 140.375,
      "completions/mean_terminated_length": 140.375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.33924783021211624,
      "epoch": 0.12501157943492358,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004711588844656944,
      "kl": 0.0026648000348359346,
      "learning_rate": 9.75006947660954e-07,
      "loss": 0.0001,
      "num_tokens": 74092413.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2699,
      "step_time": 16.340378548949957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 166.1875,
      "completions/mean_terminated_length": 166.1875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.44041479378938675,
      "epoch": 0.12505789717461788,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017850829754024744,
      "kl": 0.0020732416305691004,
      "learning_rate": 9.749976841130152e-07,
      "loss": 0.0001,
      "num_tokens": 74144048.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2700,
      "step_time": 24.832873705774546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 130.5625,
      "completions/mean_terminated_length": 130.5625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3139389604330063,
      "epoch": 0.12510421491431217,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002681613201275468,
      "kl": 0.00191219849511981,
      "learning_rate": 9.749884205650763e-07,
      "loss": 0.0001,
      "num_tokens": 74165929.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2701,
      "step_time": 14.726565402001143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 135.625,
      "completions/mean_terminated_length": 135.625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.25517112016677856,
      "epoch": 0.1251505326540065,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0047874790616333485,
      "kl": 0.0021214752341620624,
      "learning_rate": 9.749791570171375e-07,
      "loss": 0.0001,
      "num_tokens": 74189331.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2702,
      "step_time": 15.88565081730485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 173.125,
      "completions/mean_terminated_length": 173.125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.2613317035138607,
      "epoch": 0.1251968503937008,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034698653034865856,
      "kl": 0.0033956827828660607,
      "learning_rate": 9.749698934691986e-07,
      "loss": 0.0002,
      "num_tokens": 74211765.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 2703,
      "step_time": 18.23936701565981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 176.0,
      "completions/mean_terminated_length": 176.0,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.4038449004292488,
      "epoch": 0.1252431681333951,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005769040901213884,
      "kl": 0.0036217946326360106,
      "learning_rate": 9.7496062992126e-07,
      "loss": 0.0002,
      "num_tokens": 74245845.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2704,
      "step_time": 20.69897275790572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 141.9375,
      "completions/mean_terminated_length": 141.9375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.2649060934782028,
      "epoch": 0.12528948587308938,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013550998410210013,
      "kl": 0.0011289288086118177,
      "learning_rate": 9.74951366373321e-07,
      "loss": 0.0001,
      "num_tokens": 74279236.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2705,
      "step_time": 18.662751965224743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 179.3125,
      "completions/mean_terminated_length": 179.3125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.35424934327602386,
      "epoch": 0.1253358036127837,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028632325120270252,
      "kl": 0.002004824666073546,
      "learning_rate": 9.74942102825382e-07,
      "loss": 0.0001,
      "num_tokens": 74320233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2706,
      "step_time": 23.352882966399193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 158.5,
      "completions/mean_terminated_length": 158.5,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3750879764556885,
      "epoch": 0.125382121352478,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013110886793583632,
      "kl": 0.0014111300115473568,
      "learning_rate": 9.74932839277443e-07,
      "loss": 0.0001,
      "num_tokens": 74354417.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2707,
      "step_time": 19.81010114774108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 227.25,
      "completions/mean_terminated_length": 227.25,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "entropy": 0.19847723096609116,
      "epoch": 0.1254284390921723,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008015139028429985,
      "kl": 0.00550473207840696,
      "learning_rate": 9.749235757295044e-07,
      "loss": 0.0003,
      "num_tokens": 74380533.0,
      "reward": 0.9775290489196777,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9775290489196777,
      "rewards/reward_func/std": 0.0,
      "step": 2708,
      "step_time": 21.945704523473978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 198.9375,
      "completions/mean_terminated_length": 198.9375,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.4454718679189682,
      "epoch": 0.1254747568318666,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004729503765702248,
      "kl": 0.003273056587204337,
      "learning_rate": 9.749143121815655e-07,
      "loss": 0.0002,
      "num_tokens": 74406804.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2709,
      "step_time": 20.904861342161894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 125.5625,
      "completions/mean_terminated_length": 125.5625,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2619287669658661,
      "epoch": 0.12552107457156092,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012338546803221107,
      "kl": 0.0011804668611148372,
      "learning_rate": 9.749050486336267e-07,
      "loss": 0.0001,
      "num_tokens": 74434381.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2710,
      "step_time": 15.598942276090384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 123.25,
      "completions/mean_terminated_length": 123.25,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.28496143221855164,
      "epoch": 0.12556739231125522,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021542469039559364,
      "kl": 0.001996227045310661,
      "learning_rate": 9.748957850856878e-07,
      "loss": 0.0001,
      "num_tokens": 74454577.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2711,
      "step_time": 15.838134922087193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 156.5625,
      "completions/mean_terminated_length": 156.5625,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2400508113205433,
      "epoch": 0.1256137100509495,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006706647574901581,
      "kl": 0.004638098063878715,
      "learning_rate": 9.74886521537749e-07,
      "loss": 0.0002,
      "num_tokens": 74475914.0,
      "reward": 0.8242367506027222,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8242367506027222,
      "rewards/reward_func/std": 0.0,
      "step": 2712,
      "step_time": 17.55512172728777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 191.3125,
      "completions/mean_terminated_length": 191.3125,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.23226720094680786,
      "epoch": 0.1256600277906438,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018562600016593933,
      "kl": 0.0015250964497681707,
      "learning_rate": 9.7487725798981e-07,
      "loss": 0.0001,
      "num_tokens": 74499311.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2713,
      "step_time": 19.46345605701208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 199.4375,
      "completions/mean_terminated_length": 199.4375,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.43358640372753143,
      "epoch": 0.12570634553033813,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08696886897087097,
      "kl": 0.003077430708799511,
      "learning_rate": 9.748679944418712e-07,
      "loss": 0.0547,
      "num_tokens": 74525078.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 2714,
      "step_time": 23.7141883186996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 163.75,
      "completions/mean_terminated_length": 163.75,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.3527919724583626,
      "epoch": 0.12575266327003243,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006981628946959972,
      "kl": 0.0041662033763714135,
      "learning_rate": 9.748587308939323e-07,
      "loss": 0.0002,
      "num_tokens": 74551490.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2715,
      "step_time": 20.072610300034285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 156.25,
      "completions/mean_terminated_length": 156.25,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.35750482976436615,
      "epoch": 0.12579898100972672,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005137501284480095,
      "kl": 0.0038633313379250467,
      "learning_rate": 9.748494673459934e-07,
      "loss": 0.0002,
      "num_tokens": 74571638.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2716,
      "step_time": 16.836354076862335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 157.9375,
      "completions/mean_terminated_length": 157.9375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.42012248933315277,
      "epoch": 0.12584529874942102,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003115291241556406,
      "kl": 0.002565705159213394,
      "learning_rate": 9.748402037980545e-07,
      "loss": 0.0001,
      "num_tokens": 74592533.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2717,
      "step_time": 16.934609431773424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 194.625,
      "completions/mean_terminated_length": 194.625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3938866928219795,
      "epoch": 0.12589161648911534,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11733521521091461,
      "kl": 0.005466120084747672,
      "learning_rate": 9.748309402501157e-07,
      "loss": -0.0075,
      "num_tokens": 74630127.0,
      "reward": 0.20493784546852112,
      "reward_std": 0.3782358467578888,
      "rewards/reward_func/mean": 0.20493784546852112,
      "rewards/reward_func/std": 0.3782358765602112,
      "step": 2718,
      "step_time": 27.114779822528362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 130.125,
      "completions/mean_terminated_length": 130.125,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.24157185852527618,
      "epoch": 0.12593793422880964,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003785189939662814,
      "kl": 0.0017986957682296634,
      "learning_rate": 9.748216767021768e-07,
      "loss": 0.0001,
      "num_tokens": 74649697.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2719,
      "step_time": 14.310607746243477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 119.5,
      "completions/mean_terminated_length": 119.5,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.31972789764404297,
      "epoch": 0.12598425196850394,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024709198623895645,
      "kl": 0.0017789184930734336,
      "learning_rate": 9.74812413154238e-07,
      "loss": 0.0001,
      "num_tokens": 74672169.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2720,
      "step_time": 13.845789514482021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 175.25,
      "completions/mean_terminated_length": 175.25,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.26165006309747696,
      "epoch": 0.12603056970819823,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035965414717793465,
      "kl": 0.002663189312443137,
      "learning_rate": 9.748031496062993e-07,
      "loss": 0.0001,
      "num_tokens": 74693933.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2721,
      "step_time": 17.649345494806767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 121.5,
      "completions/mean_terminated_length": 121.5,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.2825714349746704,
      "epoch": 0.12607688744789255,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00237577548250556,
      "kl": 0.0020054624474141747,
      "learning_rate": 9.747938860583604e-07,
      "loss": 0.0001,
      "num_tokens": 74720757.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2722,
      "step_time": 14.613527905195951
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 143.6875,
      "completions/mean_terminated_length": 143.6875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.22826269268989563,
      "epoch": 0.12612320518758685,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00878122914582491,
      "kl": 0.00345516212109942,
      "learning_rate": 9.747846225104215e-07,
      "loss": 0.0002,
      "num_tokens": 74743328.0,
      "reward": 0.894839346408844,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.894839346408844,
      "rewards/reward_func/std": 0.0,
      "step": 2723,
      "step_time": 16.88019995391369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 144.25,
      "completions/mean_terminated_length": 144.25,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.29919153451919556,
      "epoch": 0.12616952292728115,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005003768019378185,
      "kl": 0.0022477228776551783,
      "learning_rate": 9.747753589624826e-07,
      "loss": 0.0001,
      "num_tokens": 74779412.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2724,
      "step_time": 18.6339902728796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 123.25,
      "completions/mean_terminated_length": 123.25,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.24651046097278595,
      "epoch": 0.12621584066697544,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004378539510071278,
      "kl": 0.0019843547779601067,
      "learning_rate": 9.747660954145438e-07,
      "loss": 0.0001,
      "num_tokens": 74798776.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2725,
      "step_time": 13.920933213084936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 138.3125,
      "completions/mean_terminated_length": 138.3125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.26773545891046524,
      "epoch": 0.12626215840666977,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020556682720780373,
      "kl": 0.0015064789913594723,
      "learning_rate": 9.747568318666049e-07,
      "loss": 0.0001,
      "num_tokens": 74820461.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2726,
      "step_time": 16.31049646437168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 150.1875,
      "completions/mean_terminated_length": 150.1875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3176315873861313,
      "epoch": 0.12630847614636406,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022769500501453876,
      "kl": 0.0014689369418192655,
      "learning_rate": 9.74747568318666e-07,
      "loss": 0.0001,
      "num_tokens": 74851584.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2727,
      "step_time": 18.583209179341793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 127.25,
      "completions/mean_terminated_length": 127.25,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3354658707976341,
      "epoch": 0.12635479388605836,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017642880557104945,
      "kl": 0.0014925982104614377,
      "learning_rate": 9.747383047707271e-07,
      "loss": 0.0001,
      "num_tokens": 74878756.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2728,
      "step_time": 15.775880549103022
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 303.0,
      "completions/max_terminated_length": 303.0,
      "completions/mean_length": 205.875,
      "completions/mean_terminated_length": 205.875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.38607513904571533,
      "epoch": 0.12640111162575265,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12646599113941193,
      "kl": 0.0063538794638589025,
      "learning_rate": 9.747290412227883e-07,
      "loss": -0.164,
      "num_tokens": 74901282.0,
      "reward": 0.38645732402801514,
      "reward_std": 0.4928838312625885,
      "rewards/reward_func/mean": 0.38645732402801514,
      "rewards/reward_func/std": 0.4928838610649109,
      "step": 2729,
      "step_time": 24.98566211387515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 193.25,
      "completions/mean_terminated_length": 193.25,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.2186572328209877,
      "epoch": 0.12644742936544698,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10843881964683533,
      "kl": 0.0016772676608525217,
      "learning_rate": 9.747197776748494e-07,
      "loss": -0.019,
      "num_tokens": 74931030.0,
      "reward": 0.9276199340820312,
      "reward_std": 0.050811730325222015,
      "rewards/reward_func/mean": 0.9276199340820312,
      "rewards/reward_func/std": 0.05081172287464142,
      "step": 2730,
      "step_time": 20.78960970044136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 189.25,
      "completions/mean_terminated_length": 189.25,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.2189238965511322,
      "epoch": 0.12649374710514127,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11478737741708755,
      "kl": 0.002773190673906356,
      "learning_rate": 9.747105141269105e-07,
      "loss": -0.0107,
      "num_tokens": 74961258.0,
      "reward": 0.29867756366729736,
      "reward_std": 0.14011363685131073,
      "rewards/reward_func/mean": 0.29867756366729736,
      "rewards/reward_func/std": 0.14011362195014954,
      "step": 2731,
      "step_time": 20.48701937869191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 174.75,
      "completions/mean_terminated_length": 174.75,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.1975894197821617,
      "epoch": 0.12654006484483557,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14189352095127106,
      "kl": 0.0022395200503524393,
      "learning_rate": 9.747012505789716e-07,
      "loss": 0.0182,
      "num_tokens": 74991046.0,
      "reward": 0.9431997537612915,
      "reward_std": 0.015146732330322266,
      "rewards/reward_func/mean": 0.9431997537612915,
      "rewards/reward_func/std": 0.015146732330322266,
      "step": 2732,
      "step_time": 20.003262981772423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 184.3125,
      "completions/mean_terminated_length": 184.3125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.1383097842335701,
      "epoch": 0.12658638258452987,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005424704868346453,
      "kl": 0.0015339643141487613,
      "learning_rate": 9.746919870310328e-07,
      "loss": 0.0001,
      "num_tokens": 75014795.0,
      "reward": 0.8817122578620911,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8817122578620911,
      "rewards/reward_func/std": 0.0,
      "step": 2733,
      "step_time": 19.45877280086279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 169.8125,
      "completions/mean_terminated_length": 169.8125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.23734011128544807,
      "epoch": 0.1266327003242242,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.114932119846344,
      "kl": 0.003702098852954805,
      "learning_rate": 9.74682723483094e-07,
      "loss": 0.0547,
      "num_tokens": 75036504.0,
      "reward": 0.8614631295204163,
      "reward_std": 0.23074392974376678,
      "rewards/reward_func/mean": 0.8614631295204163,
      "rewards/reward_func/std": 0.23074392974376678,
      "step": 2734,
      "step_time": 18.45914290472865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 314.0,
      "completions/max_terminated_length": 314.0,
      "completions/mean_length": 224.3125,
      "completions/mean_terminated_length": 224.3125,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.4532436206936836,
      "epoch": 0.12667901806391849,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021933862008154392,
      "kl": 0.002340391860343516,
      "learning_rate": 9.746734599351552e-07,
      "loss": 0.0001,
      "num_tokens": 75057789.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2735,
      "step_time": 26.696069829165936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 342.0,
      "completions/max_terminated_length": 342.0,
      "completions/mean_length": 247.125,
      "completions/mean_terminated_length": 247.125,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "entropy": 0.32509417086839676,
      "epoch": 0.12672533580361278,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07923617213964462,
      "kl": 0.005715687992051244,
      "learning_rate": 9.746641963872163e-07,
      "loss": -0.0206,
      "num_tokens": 75086783.0,
      "reward": 0.26315927505493164,
      "reward_std": 0.3508790135383606,
      "rewards/reward_func/mean": 0.26315927505493164,
      "rewards/reward_func/std": 0.3508790135383606,
      "step": 2736,
      "step_time": 28.975221525877714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 208.125,
      "completions/mean_terminated_length": 208.125,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.17860662564635277,
      "epoch": 0.12677165354330708,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12073924392461777,
      "kl": 0.005260401172563434,
      "learning_rate": 9.746549328392773e-07,
      "loss": -0.0395,
      "num_tokens": 75123793.0,
      "reward": 0.5995413064956665,
      "reward_std": 0.20553939044475555,
      "rewards/reward_func/mean": 0.5995413064956665,
      "rewards/reward_func/std": 0.20553939044475555,
      "step": 2737,
      "step_time": 23.80088046193123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 133.5625,
      "completions/mean_terminated_length": 133.5625,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3274148404598236,
      "epoch": 0.1268179712830014,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003784723812714219,
      "kl": 0.0029381680069491267,
      "learning_rate": 9.746456692913386e-07,
      "loss": 0.0001,
      "num_tokens": 75146986.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2738,
      "step_time": 14.571484304964542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 161.4375,
      "completions/mean_terminated_length": 161.4375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.34107372909784317,
      "epoch": 0.1268642890226957,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012313502840697765,
      "kl": 0.007771961740218103,
      "learning_rate": 9.746364057433997e-07,
      "loss": 0.0004,
      "num_tokens": 75173985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2739,
      "step_time": 22.20461354777217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 181.1875,
      "completions/mean_terminated_length": 181.1875,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.3508002907037735,
      "epoch": 0.12691060676239,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004565728362649679,
      "kl": 0.0035977655788883567,
      "learning_rate": 9.746271421954608e-07,
      "loss": 0.0002,
      "num_tokens": 75195332.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2740,
      "step_time": 19.280513919889927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 204.6875,
      "completions/mean_terminated_length": 204.6875,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.44792069494724274,
      "epoch": 0.1269569245020843,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020373642910271883,
      "kl": 0.0022946663666516542,
      "learning_rate": 9.74617878647522e-07,
      "loss": 0.0001,
      "num_tokens": 75216655.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2741,
      "step_time": 22.429401483386755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 165.125,
      "completions/mean_terminated_length": 165.125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.38756484538316727,
      "epoch": 0.1270032422417786,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012684179469943047,
      "kl": 0.011083966586738825,
      "learning_rate": 9.74608615099583e-07,
      "loss": 0.0005,
      "num_tokens": 75237313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2742,
      "step_time": 18.614252384752035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 320.0,
      "completions/max_terminated_length": 320.0,
      "completions/mean_length": 262.6875,
      "completions/mean_terminated_length": 262.6875,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "entropy": 0.21404609829187393,
      "epoch": 0.1270495599814729,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006447000429034233,
      "kl": 0.0054478979436680675,
      "learning_rate": 9.745993515516442e-07,
      "loss": 0.0003,
      "num_tokens": 75272684.0,
      "reward": 0.687289297580719,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.687289297580719,
      "rewards/reward_func/std": 0.0,
      "step": 2743,
      "step_time": 28.914159759879112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 157.6875,
      "completions/mean_terminated_length": 157.6875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.20668013021349907,
      "epoch": 0.1270958777211672,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00394948897883296,
      "kl": 0.0021256963373161852,
      "learning_rate": 9.745900880037053e-07,
      "loss": 0.0001,
      "num_tokens": 75293399.0,
      "reward": 0.2817692756652832,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.2817692756652832,
      "rewards/reward_func/std": 0.0,
      "step": 2744,
      "step_time": 16.90724340826273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 186.4375,
      "completions/mean_terminated_length": 186.4375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3060798943042755,
      "epoch": 0.1271421954608615,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09296601265668869,
      "kl": 0.0064141659531742334,
      "learning_rate": 9.745808244557665e-07,
      "loss": 0.056,
      "num_tokens": 75316430.0,
      "reward": 0.1687968224287033,
      "reward_std": 0.19767460227012634,
      "rewards/reward_func/mean": 0.1687968224287033,
      "rewards/reward_func/std": 0.19767460227012634,
      "step": 2745,
      "step_time": 20.155911333858967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 117.0625,
      "completions/mean_terminated_length": 117.0625,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.2240525707602501,
      "epoch": 0.12718851320055582,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.020976727828383446,
      "kl": 0.004377532051876187,
      "learning_rate": 9.745715609078276e-07,
      "loss": 0.0002,
      "num_tokens": 75335615.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2746,
      "step_time": 14.6689806394279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 193.75,
      "completions/mean_terminated_length": 193.75,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.39229030907154083,
      "epoch": 0.12723483094025012,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002199604408815503,
      "kl": 0.002196538494899869,
      "learning_rate": 9.745622973598887e-07,
      "loss": 0.0001,
      "num_tokens": 75392315.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2747,
      "step_time": 29.404766894876957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 185.5,
      "completions/mean_terminated_length": 185.5,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.35546116530895233,
      "epoch": 0.12728114867994442,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11322785168886185,
      "kl": 0.0035763249616138637,
      "learning_rate": 9.7455303381195e-07,
      "loss": -0.0153,
      "num_tokens": 75420019.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 2748,
      "step_time": 21.715355332940817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 164.25,
      "completions/mean_terminated_length": 164.25,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.1605934351682663,
      "epoch": 0.1273274664196387,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016634712228551507,
      "kl": 0.000984200305538252,
      "learning_rate": 9.74543770264011e-07,
      "loss": 0.0,
      "num_tokens": 75454071.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 2749,
      "step_time": 18.8595955632627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 132.25,
      "completions/mean_terminated_length": 132.25,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.397569015622139,
      "epoch": 0.12737378415933304,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038190491031855345,
      "kl": 0.0030509496573358774,
      "learning_rate": 9.74534506716072e-07,
      "loss": 0.0002,
      "num_tokens": 75485019.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2750,
      "step_time": 17.797695234417915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 168.5625,
      "completions/mean_terminated_length": 168.5625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.35855329781770706,
      "epoch": 0.12742010189902733,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003678632667288184,
      "kl": 0.002897813101299107,
      "learning_rate": 9.745252431681334e-07,
      "loss": 0.0001,
      "num_tokens": 75505780.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2751,
      "step_time": 19.002348955720663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 140.3125,
      "completions/mean_terminated_length": 140.3125,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.29185565561056137,
      "epoch": 0.12746641963872163,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033416072838008404,
      "kl": 0.0021536298736464232,
      "learning_rate": 9.745159796201946e-07,
      "loss": 0.0001,
      "num_tokens": 75526985.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2752,
      "step_time": 16.202499013394117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 182.6875,
      "completions/mean_terminated_length": 182.6875,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.17059067636728287,
      "epoch": 0.12751273737841592,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015629567205905914,
      "kl": 0.0010609637683955953,
      "learning_rate": 9.745067160722557e-07,
      "loss": 0.0001,
      "num_tokens": 75574436.0,
      "reward": 0.9000876545906067,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9000876545906067,
      "rewards/reward_func/std": 0.0,
      "step": 2753,
      "step_time": 23.694838035851717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 171.9375,
      "completions/mean_terminated_length": 171.9375,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.14076201617717743,
      "epoch": 0.12755905511811025,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001567707397043705,
      "kl": 0.0011508175230119377,
      "learning_rate": 9.744974525243168e-07,
      "loss": 0.0001,
      "num_tokens": 75595683.0,
      "reward": 0.9555630087852478,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9555630087852478,
      "rewards/reward_func/std": 0.0,
      "step": 2754,
      "step_time": 16.34590784087777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 157.3125,
      "completions/mean_terminated_length": 157.3125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.4654070883989334,
      "epoch": 0.12760537285780454,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027254438027739525,
      "kl": 0.0026476834318600595,
      "learning_rate": 9.74488188976378e-07,
      "loss": 0.0001,
      "num_tokens": 75630792.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2755,
      "step_time": 20.714604150503874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 215.8125,
      "completions/mean_terminated_length": 215.8125,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.22874604910612106,
      "epoch": 0.12765169059749884,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09279012680053711,
      "kl": 0.007919730502180755,
      "learning_rate": 9.74478925428439e-07,
      "loss": -0.0144,
      "num_tokens": 75652917.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 2756,
      "step_time": 20.54040264710784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 124.1875,
      "completions/mean_terminated_length": 124.1875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2796595022082329,
      "epoch": 0.12769800833719314,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001744032371789217,
      "kl": 0.0013569175789598376,
      "learning_rate": 9.744696618805002e-07,
      "loss": 0.0001,
      "num_tokens": 75674504.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2757,
      "step_time": 13.419739801436663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 122.75,
      "completions/mean_terminated_length": 122.75,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.3256135880947113,
      "epoch": 0.12774432607688746,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016883771168068051,
      "kl": 0.0018302177195437253,
      "learning_rate": 9.744603983325613e-07,
      "loss": 0.0001,
      "num_tokens": 75699268.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2758,
      "step_time": 14.83268203213811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 137.75,
      "completions/mean_terminated_length": 137.75,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.2979929596185684,
      "epoch": 0.12779064381658176,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001972433878108859,
      "kl": 0.001491810631705448,
      "learning_rate": 9.744511347846224e-07,
      "loss": 0.0001,
      "num_tokens": 75727760.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2759,
      "step_time": 16.303051222115755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 161.625,
      "completions/mean_terminated_length": 161.625,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.36969175189733505,
      "epoch": 0.12783696155627605,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004510269034653902,
      "kl": 0.003775397897697985,
      "learning_rate": 9.744418712366836e-07,
      "loss": 0.0002,
      "num_tokens": 75750378.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2760,
      "step_time": 18.004489295184612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 186.8125,
      "completions/mean_terminated_length": 186.8125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.32959359884262085,
      "epoch": 0.12788327929597035,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11003851890563965,
      "kl": 0.006389186019077897,
      "learning_rate": 9.744326076887447e-07,
      "loss": -0.0182,
      "num_tokens": 75773303.0,
      "reward": 0.8806997537612915,
      "reward_std": 0.23485326766967773,
      "rewards/reward_func/mean": 0.8806997537612915,
      "rewards/reward_func/std": 0.23485328257083893,
      "step": 2761,
      "step_time": 21.36713733896613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 164.8125,
      "completions/mean_terminated_length": 164.8125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.2286955863237381,
      "epoch": 0.12792959703566467,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1614483892917633,
      "kl": 0.012991887633688748,
      "learning_rate": 9.744233441408058e-07,
      "loss": -0.0101,
      "num_tokens": 75794740.0,
      "reward": 0.9262244701385498,
      "reward_std": 0.15861256420612335,
      "rewards/reward_func/mean": 0.9262244701385498,
      "rewards/reward_func/std": 0.15861256420612335,
      "step": 2762,
      "step_time": 16.983039274811745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 124.3125,
      "completions/mean_terminated_length": 124.3125,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2735665291547775,
      "epoch": 0.12797591477535897,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002134870272129774,
      "kl": 0.0016807553183753043,
      "learning_rate": 9.74414080592867e-07,
      "loss": 0.0001,
      "num_tokens": 75815449.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2763,
      "step_time": 13.364386174827814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 228.25,
      "completions/mean_terminated_length": 228.25,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "entropy": 0.24697228893637657,
      "epoch": 0.12802223251505326,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022184564732015133,
      "kl": 0.0017724183562677354,
      "learning_rate": 9.744048170449283e-07,
      "loss": 0.0001,
      "num_tokens": 75840365.0,
      "reward": 0.9648571610450745,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9648571610450745,
      "rewards/reward_func/std": 0.0,
      "step": 2764,
      "step_time": 22.50494721531868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 127.3125,
      "completions/mean_terminated_length": 127.3125,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2863641083240509,
      "epoch": 0.12806855025474756,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002310654614120722,
      "kl": 0.0019112858863081783,
      "learning_rate": 9.743955534969894e-07,
      "loss": 0.0001,
      "num_tokens": 75863650.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2765,
      "step_time": 14.678524300456047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 227.875,
      "completions/mean_terminated_length": 227.875,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "entropy": 0.19780412316322327,
      "epoch": 0.12811486799444188,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004414843861013651,
      "kl": 0.0038772448897361755,
      "learning_rate": 9.743862899490505e-07,
      "loss": 0.0002,
      "num_tokens": 75889392.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2766,
      "step_time": 21.383998308330774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 126.875,
      "completions/mean_terminated_length": 126.875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2553219012916088,
      "epoch": 0.12816118573413618,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018279865616932511,
      "kl": 0.0014974544756114483,
      "learning_rate": 9.743770264011116e-07,
      "loss": 0.0001,
      "num_tokens": 75908750.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2767,
      "step_time": 14.12143798545003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 180.9375,
      "completions/mean_terminated_length": 180.9375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.2772565595805645,
      "epoch": 0.12820750347383048,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09665929526090622,
      "kl": 0.010590121382847428,
      "learning_rate": 9.743677628531728e-07,
      "loss": -0.0762,
      "num_tokens": 75929837.0,
      "reward": 0.548355758190155,
      "reward_std": 0.4709661900997162,
      "rewards/reward_func/mean": 0.548355758190155,
      "rewards/reward_func/std": 0.4709661900997162,
      "step": 2768,
      "step_time": 21.317594226449728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 121.0,
      "completions/max_terminated_length": 121.0,
      "completions/mean_length": 106.3125,
      "completions/mean_terminated_length": 106.3125,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "entropy": 0.2962646558880806,
      "epoch": 0.12825382121352477,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0066306861117482185,
      "kl": 0.0021396586671471596,
      "learning_rate": 9.743584993052339e-07,
      "loss": 0.0001,
      "num_tokens": 75951442.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2769,
      "step_time": 12.336755074560642
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 180.5625,
      "completions/mean_terminated_length": 180.5625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.3718944415450096,
      "epoch": 0.1283001389532191,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036362269893288612,
      "kl": 0.002736643247772008,
      "learning_rate": 9.74349235757295e-07,
      "loss": 0.0001,
      "num_tokens": 75974187.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2770,
      "step_time": 19.46478195488453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 169.4375,
      "completions/mean_terminated_length": 169.4375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.39902693778276443,
      "epoch": 0.1283464566929134,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00557924248278141,
      "kl": 0.0039357165223918855,
      "learning_rate": 9.743399722093561e-07,
      "loss": 0.0002,
      "num_tokens": 76011122.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2771,
      "step_time": 20.970091186463833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 144.25,
      "completions/mean_terminated_length": 144.25,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.21097231283783913,
      "epoch": 0.1283927744326077,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12273252010345459,
      "kl": 0.0033145806519314647,
      "learning_rate": 9.743307086614173e-07,
      "loss": -0.0722,
      "num_tokens": 76036102.0,
      "reward": 0.11289691925048828,
      "reward_std": 0.23656082153320312,
      "rewards/reward_func/mean": 0.11289691925048828,
      "rewards/reward_func/std": 0.23656083643436432,
      "step": 2772,
      "step_time": 17.545169688761234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 181.5625,
      "completions/mean_terminated_length": 181.5625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.24039329588413239,
      "epoch": 0.12843909217230198,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004032476805150509,
      "kl": 0.0030674479785375297,
      "learning_rate": 9.743214451134784e-07,
      "loss": 0.0002,
      "num_tokens": 76057743.0,
      "reward": 0.31760504841804504,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.31760504841804504,
      "rewards/reward_func/std": 0.0,
      "step": 2773,
      "step_time": 18.22195716202259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 172.625,
      "completions/mean_terminated_length": 172.625,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.37091164290905,
      "epoch": 0.1284854099119963,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025031070690602064,
      "kl": 0.002361326478421688,
      "learning_rate": 9.743121815655395e-07,
      "loss": 0.0001,
      "num_tokens": 76085465.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2774,
      "step_time": 20.650371208786964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 165.25,
      "completions/mean_terminated_length": 165.25,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.4116973876953125,
      "epoch": 0.1285317276516906,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002033612225204706,
      "kl": 0.002104911662172526,
      "learning_rate": 9.743029180176006e-07,
      "loss": 0.0001,
      "num_tokens": 76130317.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2775,
      "step_time": 22.69060404598713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 178.3125,
      "completions/mean_terminated_length": 178.3125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.321686789393425,
      "epoch": 0.1285780453913849,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12945276498794556,
      "kl": 0.006211111904121935,
      "learning_rate": 9.742936544696618e-07,
      "loss": 0.0791,
      "num_tokens": 76151330.0,
      "reward": 0.008539922535419464,
      "reward_std": 0.0022773125674575567,
      "rewards/reward_func/mean": 0.008539922535419464,
      "rewards/reward_func/std": 0.0022773125674575567,
      "step": 2776,
      "step_time": 20.260665953159332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 117.5625,
      "completions/mean_terminated_length": 117.5625,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.29922936111688614,
      "epoch": 0.1286243631310792,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006918784696608782,
      "kl": 0.0030423023272305727,
      "learning_rate": 9.742843909217229e-07,
      "loss": 0.0001,
      "num_tokens": 76175003.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2777,
      "step_time": 14.081242229789495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 135.375,
      "completions/mean_terminated_length": 135.375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.32472098618745804,
      "epoch": 0.12867068087077352,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003332186955958605,
      "kl": 0.0022369700018316507,
      "learning_rate": 9.742751273737842e-07,
      "loss": 0.0001,
      "num_tokens": 76199105.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2778,
      "step_time": 15.179923698306084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 233.0,
      "completions/mean_terminated_length": 233.0,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.3663220778107643,
      "epoch": 0.12871699861046781,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01513900700956583,
      "kl": 0.011680921306833625,
      "learning_rate": 9.742658638258453e-07,
      "loss": 0.0006,
      "num_tokens": 76226449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2779,
      "step_time": 24.165780186653137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 219.75,
      "completions/mean_terminated_length": 219.75,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.27411989122629166,
      "epoch": 0.1287633163501621,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10122770816087723,
      "kl": 0.006216954789124429,
      "learning_rate": 9.742566002779063e-07,
      "loss": -0.0375,
      "num_tokens": 76264061.0,
      "reward": 0.34300726652145386,
      "reward_std": 0.2744058072566986,
      "rewards/reward_func/mean": 0.34300726652145386,
      "rewards/reward_func/std": 0.2744058072566986,
      "step": 2780,
      "step_time": 24.751993499696255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 150.125,
      "completions/mean_terminated_length": 150.125,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.1366775520145893,
      "epoch": 0.1288096340898564,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009560997714288533,
      "kl": 0.0007964491378515959,
      "learning_rate": 9.742473367299676e-07,
      "loss": 0.0,
      "num_tokens": 76285807.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2781,
      "step_time": 17.288374941796064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 122.3125,
      "completions/mean_terminated_length": 122.3125,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2840088978409767,
      "epoch": 0.12885595182955073,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002545348834246397,
      "kl": 0.0017528543248772621,
      "learning_rate": 9.742380731820287e-07,
      "loss": 0.0001,
      "num_tokens": 76306932.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2782,
      "step_time": 13.426001753658056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 119.5,
      "completions/mean_terminated_length": 119.5,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.27867213636636734,
      "epoch": 0.12890226956924503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002323121763765812,
      "kl": 0.00176383025245741,
      "learning_rate": 9.742288096340898e-07,
      "loss": 0.0001,
      "num_tokens": 76331020.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2783,
      "step_time": 14.70707942545414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 168.75,
      "completions/mean_terminated_length": 168.75,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.19457445666193962,
      "epoch": 0.12894858730893932,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11523640155792236,
      "kl": 0.00239378004334867,
      "learning_rate": 9.74219546086151e-07,
      "loss": -0.0114,
      "num_tokens": 76354584.0,
      "reward": 0.9422565698623657,
      "reward_std": 0.23097383975982666,
      "rewards/reward_func/mean": 0.9422565698623657,
      "rewards/reward_func/std": 0.23097383975982666,
      "step": 2784,
      "step_time": 18.583884596824646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 134.4375,
      "completions/mean_terminated_length": 134.4375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.26692158728837967,
      "epoch": 0.12899490504863362,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010094476863741875,
      "kl": 0.0031349931086879224,
      "learning_rate": 9.74210282538212e-07,
      "loss": 0.0002,
      "num_tokens": 76381455.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2785,
      "step_time": 16.40631280466914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 139.0625,
      "completions/mean_terminated_length": 139.0625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2784252278506756,
      "epoch": 0.12904122278832794,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19294673204421997,
      "kl": 0.004295072401873767,
      "learning_rate": 9.742010189902732e-07,
      "loss": 0.0017,
      "num_tokens": 76416352.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 2786,
      "step_time": 19.347070194780827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 146.9375,
      "completions/mean_terminated_length": 146.9375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.38738276809453964,
      "epoch": 0.12908754052802224,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003739331616088748,
      "kl": 0.0035139237297698855,
      "learning_rate": 9.741917554423343e-07,
      "loss": 0.0002,
      "num_tokens": 76448223.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2787,
      "step_time": 18.148137751966715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 159.4375,
      "completions/mean_terminated_length": 159.4375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.19509608671069145,
      "epoch": 0.12913385826771653,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003527343040332198,
      "kl": 0.001734732068143785,
      "learning_rate": 9.741824918943955e-07,
      "loss": 0.0001,
      "num_tokens": 76470230.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2788,
      "step_time": 15.995738919824362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 167.375,
      "completions/mean_terminated_length": 167.375,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.343943752348423,
      "epoch": 0.12918017600741083,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020726339425891638,
      "kl": 0.001907685917103663,
      "learning_rate": 9.741732283464566e-07,
      "loss": 0.0001,
      "num_tokens": 76492156.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2789,
      "step_time": 18.224940542131662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 175.0625,
      "completions/mean_terminated_length": 175.0625,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.4176969677209854,
      "epoch": 0.12922649374710515,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011609376408159733,
      "kl": 0.00793652969878167,
      "learning_rate": 9.741639647985177e-07,
      "loss": 0.0004,
      "num_tokens": 76516781.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2790,
      "step_time": 20.1127092204988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 158.3125,
      "completions/mean_terminated_length": 158.3125,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.36189454793930054,
      "epoch": 0.12927281148679945,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036154116969555616,
      "kl": 0.003362747753271833,
      "learning_rate": 9.74154701250579e-07,
      "loss": 0.0002,
      "num_tokens": 76548994.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2791,
      "step_time": 19.28479090332985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 262.0,
      "completions/max_terminated_length": 262.0,
      "completions/mean_length": 189.1875,
      "completions/mean_terminated_length": 189.1875,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.4514002129435539,
      "epoch": 0.12931912922649375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006092879921197891,
      "kl": 0.004910267889499664,
      "learning_rate": 9.7414543770264e-07,
      "loss": 0.0002,
      "num_tokens": 76594005.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2792,
      "step_time": 27.47137390822172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 144.625,
      "completions/mean_terminated_length": 144.625,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.35142024606466293,
      "epoch": 0.12936544696618804,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029565657023340464,
      "kl": 0.002456948481267318,
      "learning_rate": 9.74136174154701e-07,
      "loss": 0.0001,
      "num_tokens": 76617183.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2793,
      "step_time": 17.76403970271349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 166.0625,
      "completions/mean_terminated_length": 166.0625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.290935218334198,
      "epoch": 0.12941176470588237,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005273391027003527,
      "kl": 0.0038551719044335186,
      "learning_rate": 9.741269106067624e-07,
      "loss": 0.0002,
      "num_tokens": 76642080.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2794,
      "step_time": 18.212548714131117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 146.5625,
      "completions/mean_terminated_length": 146.5625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3950282782316208,
      "epoch": 0.12945808244557666,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018907836638391018,
      "kl": 0.0021647471003234386,
      "learning_rate": 9.741176470588236e-07,
      "loss": 0.0001,
      "num_tokens": 76671529.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2795,
      "step_time": 18.63210655003786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 111.5,
      "completions/mean_terminated_length": 111.5,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.23798342049121857,
      "epoch": 0.12950440018527096,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018192932475358248,
      "kl": 0.0012378752580843866,
      "learning_rate": 9.741083835108847e-07,
      "loss": 0.0001,
      "num_tokens": 76691969.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2796,
      "step_time": 13.495010670274496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 135.5,
      "completions/mean_terminated_length": 135.5,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.20825041830539703,
      "epoch": 0.12955071792496525,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002133691916242242,
      "kl": 0.0016764855245128274,
      "learning_rate": 9.740991199629458e-07,
      "loss": 0.0001,
      "num_tokens": 76712009.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2797,
      "step_time": 15.695982314646244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 119.0,
      "completions/max_terminated_length": 119.0,
      "completions/mean_length": 109.375,
      "completions/mean_terminated_length": 109.375,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.22557823359966278,
      "epoch": 0.12959703566465958,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020180430728942156,
      "kl": 0.0015038259298307821,
      "learning_rate": 9.74089856415007e-07,
      "loss": 0.0001,
      "num_tokens": 76731167.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2798,
      "step_time": 11.681996446102858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 143.0625,
      "completions/mean_terminated_length": 143.0625,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.14772458374500275,
      "epoch": 0.12964335340435387,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011993461521342397,
      "kl": 0.0010793707915581763,
      "learning_rate": 9.74080592867068e-07,
      "loss": 0.0001,
      "num_tokens": 76777584.0,
      "reward": 0.796358048915863,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.796358048915863,
      "rewards/reward_func/std": 0.0,
      "step": 2799,
      "step_time": 22.49989054724574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 131.0,
      "completions/mean_length": 121.6875,
      "completions/mean_terminated_length": 121.6875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2562808692455292,
      "epoch": 0.12968967114404817,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010770438238978386,
      "kl": 0.0009614722366677597,
      "learning_rate": 9.740713293191292e-07,
      "loss": 0.0,
      "num_tokens": 76806011.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2800,
      "step_time": 14.480131164193153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 133.625,
      "completions/mean_terminated_length": 133.625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3112131878733635,
      "epoch": 0.12973598888374246,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002840921049937606,
      "kl": 0.0020887310383841395,
      "learning_rate": 9.740620657711903e-07,
      "loss": 0.0001,
      "num_tokens": 76830293.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2801,
      "step_time": 15.810532130300999
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 127.5,
      "completions/mean_terminated_length": 127.5,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.3131905794143677,
      "epoch": 0.1297823066234368,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026322880294173956,
      "kl": 0.0021417181997094303,
      "learning_rate": 9.740528022232514e-07,
      "loss": 0.0001,
      "num_tokens": 76854029.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2802,
      "step_time": 14.700319416821003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 159.5,
      "completions/mean_terminated_length": 159.5,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.19162605702877045,
      "epoch": 0.12982862436313108,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009328178130090237,
      "kl": 0.005718842032365501,
      "learning_rate": 9.740435386753126e-07,
      "loss": 0.0003,
      "num_tokens": 76884373.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2803,
      "step_time": 19.34447018429637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 192.375,
      "completions/mean_terminated_length": 192.375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.3919840082526207,
      "epoch": 0.12987494210282538,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10269218683242798,
      "kl": 0.005773777607828379,
      "learning_rate": 9.740342751273739e-07,
      "loss": -0.1533,
      "num_tokens": 76909995.0,
      "reward": 0.10943973064422607,
      "reward_std": 0.1957717388868332,
      "rewards/reward_func/mean": 0.10943973064422607,
      "rewards/reward_func/std": 0.1957717388868332,
      "step": 2804,
      "step_time": 22.360922671854496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 128.4375,
      "completions/mean_terminated_length": 128.4375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.27369359880685806,
      "epoch": 0.12992125984251968,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034899504389613867,
      "kl": 0.002289213822223246,
      "learning_rate": 9.740250115794348e-07,
      "loss": 0.0001,
      "num_tokens": 76930626.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2805,
      "step_time": 14.012876734137535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 144.625,
      "completions/mean_terminated_length": 144.625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.35228947550058365,
      "epoch": 0.129967577582214,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16344957053661346,
      "kl": 0.006582444650121033,
      "learning_rate": 9.74015748031496e-07,
      "loss": -0.104,
      "num_tokens": 76954972.0,
      "reward": 0.04720311611890793,
      "reward_std": 0.1888124644756317,
      "rewards/reward_func/mean": 0.04720311611890793,
      "rewards/reward_func/std": 0.1888124793767929,
      "step": 2806,
      "step_time": 18.877719160169363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 191.9375,
      "completions/mean_terminated_length": 191.9375,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.39682287722826004,
      "epoch": 0.1300138953219083,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019119007047265768,
      "kl": 0.0019481416675262153,
      "learning_rate": 9.74006484483557e-07,
      "loss": 0.0001,
      "num_tokens": 76985995.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2807,
      "step_time": 21.04789998009801
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 310.0,
      "completions/max_terminated_length": 310.0,
      "completions/mean_length": 224.5625,
      "completions/mean_terminated_length": 224.5625,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.4276200234889984,
      "epoch": 0.1300602130616026,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11104413121938705,
      "kl": 0.009202176705002785,
      "learning_rate": 9.739972209356184e-07,
      "loss": -0.117,
      "num_tokens": 77018948.0,
      "reward": 0.0025933170691132545,
      "reward_std": 0.003972659353166819,
      "rewards/reward_func/mean": 0.0025933170691132545,
      "rewards/reward_func/std": 0.003972659818828106,
      "step": 2808,
      "step_time": 28.487824976444244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 175.0,
      "completions/mean_terminated_length": 175.0,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.40838924795389175,
      "epoch": 0.1301065308012969,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014654034748673439,
      "kl": 0.006727033876813948,
      "learning_rate": 9.739879573876795e-07,
      "loss": 0.0003,
      "num_tokens": 77048420.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2809,
      "step_time": 21.227949671447277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 199.25,
      "completions/mean_terminated_length": 199.25,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.4511312171816826,
      "epoch": 0.1301528485409912,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002993133617565036,
      "kl": 0.0028400610899552703,
      "learning_rate": 9.739786938397406e-07,
      "loss": 0.0001,
      "num_tokens": 77074344.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2810,
      "step_time": 22.26493389904499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 194.4375,
      "completions/mean_terminated_length": 194.4375,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.2299153245985508,
      "epoch": 0.1301991662806855,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1469038426876068,
      "kl": 0.012816822156310081,
      "learning_rate": 9.739694302918018e-07,
      "loss": 0.0382,
      "num_tokens": 77107743.0,
      "reward": 0.48003360629081726,
      "reward_std": 0.23582106828689575,
      "rewards/reward_func/mean": 0.48003360629081726,
      "rewards/reward_func/std": 0.23582108318805695,
      "step": 2811,
      "step_time": 25.016322780400515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 118.8125,
      "completions/mean_terminated_length": 118.8125,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2408006638288498,
      "epoch": 0.1302454840203798,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00309711042791605,
      "kl": 0.0018486992630641907,
      "learning_rate": 9.739601667438629e-07,
      "loss": 0.0001,
      "num_tokens": 77127020.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2812,
      "step_time": 13.587357494980097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 176.0625,
      "completions/mean_terminated_length": 176.0625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.20126453414559364,
      "epoch": 0.1302918017600741,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12710730731487274,
      "kl": 0.00843966007232666,
      "learning_rate": 9.73950903195924e-07,
      "loss": -0.0621,
      "num_tokens": 77150157.0,
      "reward": 0.6890777349472046,
      "reward_std": 0.19096322357654572,
      "rewards/reward_func/mean": 0.6890777349472046,
      "rewards/reward_func/std": 0.19096322357654572,
      "step": 2813,
      "step_time": 19.778738137334585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 224.4375,
      "completions/mean_terminated_length": 224.4375,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.48743831366300583,
      "epoch": 0.13033811949976842,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027950266376137733,
      "kl": 0.0023856922634877264,
      "learning_rate": 9.739416396479851e-07,
      "loss": 0.0001,
      "num_tokens": 77184212.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2814,
      "step_time": 25.894040428102016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 178.5625,
      "completions/mean_terminated_length": 178.5625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.25779133290052414,
      "epoch": 0.13038443723946272,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08911740034818649,
      "kl": 0.0016971510485745966,
      "learning_rate": 9.739323761000463e-07,
      "loss": -0.0071,
      "num_tokens": 77219549.0,
      "reward": 0.8511500954627991,
      "reward_std": 0.0016676903469488025,
      "rewards/reward_func/mean": 0.8511500954627991,
      "rewards/reward_func/std": 0.0016676932573318481,
      "step": 2815,
      "step_time": 22.849760118871927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 154.0,
      "completions/mean_terminated_length": 154.0,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.2913820743560791,
      "epoch": 0.13043075497915702,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025024309288710356,
      "kl": 0.0018942720780614763,
      "learning_rate": 9.739231125521074e-07,
      "loss": 0.0001,
      "num_tokens": 77249405.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2816,
      "step_time": 18.067398082464933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 417.0,
      "completions/max_terminated_length": 417.0,
      "completions/mean_length": 298.3125,
      "completions/mean_terminated_length": 298.3125,
      "completions/min_length": 229.0,
      "completions/min_terminated_length": 229.0,
      "entropy": 0.35839635133743286,
      "epoch": 0.1304770727188513,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07766842842102051,
      "kl": 0.004964290070347488,
      "learning_rate": 9.739138490041685e-07,
      "loss": -0.1697,
      "num_tokens": 77289698.0,
      "reward": 0.23419679701328278,
      "reward_std": 0.35474202036857605,
      "rewards/reward_func/mean": 0.23419679701328278,
      "rewards/reward_func/std": 0.35474202036857605,
      "step": 2817,
      "step_time": 38.30240035802126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 132.25,
      "completions/mean_terminated_length": 132.25,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.3360302150249481,
      "epoch": 0.13052339045854564,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030505871400237083,
      "kl": 0.002078228397294879,
      "learning_rate": 9.739045854562296e-07,
      "loss": 0.0001,
      "num_tokens": 77325526.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2818,
      "step_time": 19.013024352490902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 193.4375,
      "completions/mean_terminated_length": 193.4375,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.3325582519173622,
      "epoch": 0.13056970819823993,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017353898147121072,
      "kl": 0.001587057369761169,
      "learning_rate": 9.738953219082908e-07,
      "loss": 0.0001,
      "num_tokens": 77352685.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2819,
      "step_time": 20.33905030414462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 215.375,
      "completions/mean_terminated_length": 215.375,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.2956802621483803,
      "epoch": 0.13061602593793423,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09080685675144196,
      "kl": 0.005979874520562589,
      "learning_rate": 9.738860583603519e-07,
      "loss": 0.0197,
      "num_tokens": 77376211.0,
      "reward": 0.1551075428724289,
      "reward_std": 0.036326441913843155,
      "rewards/reward_func/mean": 0.1551075428724289,
      "rewards/reward_func/std": 0.03632644936442375,
      "step": 2820,
      "step_time": 21.90963600203395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 150.9375,
      "completions/mean_terminated_length": 150.9375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.38743866980075836,
      "epoch": 0.13066234367762852,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015383700374513865,
      "kl": 0.0016945595853030682,
      "learning_rate": 9.738767948124132e-07,
      "loss": 0.0001,
      "num_tokens": 77419986.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2821,
      "step_time": 21.644740346819162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 162.8125,
      "completions/mean_terminated_length": 162.8125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.26039203628897667,
      "epoch": 0.13070866141732285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11434976756572723,
      "kl": 0.0033649375254753977,
      "learning_rate": 9.738675312644744e-07,
      "loss": -0.0345,
      "num_tokens": 77441775.0,
      "reward": 0.9293943643569946,
      "reward_std": 0.035030219703912735,
      "rewards/reward_func/mean": 0.9293943643569946,
      "rewards/reward_func/std": 0.03503022342920303,
      "step": 2822,
      "step_time": 18.52336959913373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 128.4375,
      "completions/mean_terminated_length": 128.4375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3027849420905113,
      "epoch": 0.13075497915701714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005467102862894535,
      "kl": 0.0026550963521003723,
      "learning_rate": 9.738582677165353e-07,
      "loss": 0.0001,
      "num_tokens": 77465526.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2823,
      "step_time": 15.159585032612085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 169.4375,
      "completions/mean_terminated_length": 169.4375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.25463734567165375,
      "epoch": 0.13080129689671144,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0048914761282503605,
      "kl": 0.002183649019571021,
      "learning_rate": 9.738490041685966e-07,
      "loss": 0.0001,
      "num_tokens": 77486445.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2824,
      "step_time": 17.311506897211075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 199.0,
      "completions/mean_terminated_length": 199.0,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3090377524495125,
      "epoch": 0.13084761463640573,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23248831927776337,
      "kl": 0.005472452496178448,
      "learning_rate": 9.738397406206577e-07,
      "loss": -0.019,
      "num_tokens": 77512237.0,
      "reward": 0.4834849238395691,
      "reward_std": 0.4851493239402771,
      "rewards/reward_func/mean": 0.4834849238395691,
      "rewards/reward_func/std": 0.4851492941379547,
      "step": 2825,
      "step_time": 21.344312380999327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 116.0625,
      "completions/mean_terminated_length": 116.0625,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.2596679925918579,
      "epoch": 0.13089393237610006,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020094362553209066,
      "kl": 0.001851470791734755,
      "learning_rate": 9.738304770727189e-07,
      "loss": 0.0001,
      "num_tokens": 77533870.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2826,
      "step_time": 13.552637655287981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 210.5625,
      "completions/mean_terminated_length": 210.5625,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.1898454986512661,
      "epoch": 0.13094025011579435,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038989358581602573,
      "kl": 0.002383921411819756,
      "learning_rate": 9.7382121352478e-07,
      "loss": 0.0001,
      "num_tokens": 77571959.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2827,
      "step_time": 23.55286794155836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 186.125,
      "completions/mean_terminated_length": 186.125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.17277348786592484,
      "epoch": 0.13098656785548865,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012403902364894748,
      "kl": 0.0008625778427813202,
      "learning_rate": 9.73811949976841e-07,
      "loss": 0.0,
      "num_tokens": 77604505.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 2828,
      "step_time": 21.842342231422663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 193.0,
      "completions/mean_terminated_length": 193.0,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.36300013214349747,
      "epoch": 0.13103288559518295,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005545167252421379,
      "kl": 0.004219019669108093,
      "learning_rate": 9.738026864289022e-07,
      "loss": 0.0002,
      "num_tokens": 77634169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2829,
      "step_time": 21.325227595865726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 143.3125,
      "completions/mean_terminated_length": 143.3125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3187604248523712,
      "epoch": 0.13107920333487727,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027888871263712645,
      "kl": 0.0021906490437686443,
      "learning_rate": 9.737934228809634e-07,
      "loss": 0.0001,
      "num_tokens": 77659806.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2830,
      "step_time": 16.002289425581694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 152.5,
      "completions/mean_terminated_length": 152.5,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.428384892642498,
      "epoch": 0.13112552107457157,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022219247184693813,
      "kl": 0.0027715860633179545,
      "learning_rate": 9.737841593330245e-07,
      "loss": 0.0001,
      "num_tokens": 77705062.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2831,
      "step_time": 22.706428229808807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 137.875,
      "completions/mean_terminated_length": 137.875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.332115963101387,
      "epoch": 0.13117183881426586,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004188732244074345,
      "kl": 0.002412432979326695,
      "learning_rate": 9.737748957850856e-07,
      "loss": 0.0001,
      "num_tokens": 77732244.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2832,
      "step_time": 17.01812907680869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 147.6875,
      "completions/mean_terminated_length": 147.6875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3853449895977974,
      "epoch": 0.13121815655396016,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004720553755760193,
      "kl": 0.003017849929165095,
      "learning_rate": 9.737656322371467e-07,
      "loss": 0.0002,
      "num_tokens": 77763391.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2833,
      "step_time": 17.235878136008978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 127.4375,
      "completions/mean_terminated_length": 127.4375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.31607072055339813,
      "epoch": 0.13126447429365448,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0046013761311769485,
      "kl": 0.002206444158218801,
      "learning_rate": 9.73756368689208e-07,
      "loss": 0.0001,
      "num_tokens": 77791814.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2834,
      "step_time": 15.14806305989623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 128.0,
      "completions/mean_terminated_length": 128.0,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.29682984203100204,
      "epoch": 0.13131079203334878,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023793757427483797,
      "kl": 0.001989031967241317,
      "learning_rate": 9.73747105141269e-07,
      "loss": 0.0001,
      "num_tokens": 77812534.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2835,
      "step_time": 14.895387556403875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 148.25,
      "completions/mean_terminated_length": 148.25,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.3648112565279007,
      "epoch": 0.13135710977304307,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023736937437206507,
      "kl": 0.002105565945385024,
      "learning_rate": 9.7373784159333e-07,
      "loss": 0.0001,
      "num_tokens": 77838938.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2836,
      "step_time": 17.144540406763554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 175.875,
      "completions/mean_terminated_length": 175.875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.37220189720392227,
      "epoch": 0.13140342751273737,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.03227676451206207,
      "kl": 0.011503327172249556,
      "learning_rate": 9.737285780453912e-07,
      "loss": 0.0039,
      "num_tokens": 77861016.0,
      "reward": 2.237048465758562e-05,
      "reward_std": 2.982731348311063e-05,
      "rewards/reward_func/mean": 2.237048465758562e-05,
      "rewards/reward_func/std": 2.9827315302100033e-05,
      "step": 2837,
      "step_time": 19.155315332114697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 144.625,
      "completions/mean_terminated_length": 144.625,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.3582801818847656,
      "epoch": 0.1314497452524317,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003145157126709819,
      "kl": 0.002566359471529722,
      "learning_rate": 9.737193144974526e-07,
      "loss": 0.0001,
      "num_tokens": 77884482.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2838,
      "step_time": 16.08520222082734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 171.75,
      "completions/mean_terminated_length": 171.75,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.37275414168834686,
      "epoch": 0.131496062992126,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0041363718919456005,
      "kl": 0.002808906661812216,
      "learning_rate": 9.737100509495137e-07,
      "loss": 0.0001,
      "num_tokens": 77908030.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2839,
      "step_time": 18.000150248408318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 205.0625,
      "completions/mean_terminated_length": 205.0625,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.2423257678747177,
      "epoch": 0.13154238073182029,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0757722333073616,
      "kl": 0.003921059018466622,
      "learning_rate": 9.737007874015748e-07,
      "loss": -0.0097,
      "num_tokens": 77938927.0,
      "reward": 0.6516474485397339,
      "reward_std": 0.012865008786320686,
      "rewards/reward_func/mean": 0.6516474485397339,
      "rewards/reward_func/std": 0.012865009717643261,
      "step": 2840,
      "step_time": 24.081785764545202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 443.0,
      "completions/max_terminated_length": 443.0,
      "completions/mean_length": 396.375,
      "completions/mean_terminated_length": 396.375,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "entropy": 0.2218848541378975,
      "epoch": 0.13158869847151458,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06507100909948349,
      "kl": 0.003597293747588992,
      "learning_rate": 9.73691523853636e-07,
      "loss": -0.0362,
      "num_tokens": 77971877.0,
      "reward": 0.939376711845398,
      "reward_std": 0.09530484676361084,
      "rewards/reward_func/mean": 0.939376711845398,
      "rewards/reward_func/std": 0.09530485421419144,
      "step": 2841,
      "step_time": 37.56795885413885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 170.5625,
      "completions/mean_terminated_length": 170.5625,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.2796224169433117,
      "epoch": 0.1316350162112089,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11485463380813599,
      "kl": 0.006121489219367504,
      "learning_rate": 9.73682260305697e-07,
      "loss": -0.0539,
      "num_tokens": 77993054.0,
      "reward": 0.9737098217010498,
      "reward_std": 0.0470292828977108,
      "rewards/reward_func/mean": 0.9737098217010498,
      "rewards/reward_func/std": 0.0470292754471302,
      "step": 2842,
      "step_time": 18.752591751515865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 110.5625,
      "completions/mean_terminated_length": 110.5625,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2667034789919853,
      "epoch": 0.1316813339509032,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004681343678385019,
      "kl": 0.0029791942215524614,
      "learning_rate": 9.736729967577582e-07,
      "loss": 0.0001,
      "num_tokens": 78013623.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2843,
      "step_time": 13.74761088192463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 168.5,
      "completions/mean_terminated_length": 168.5,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3632110729813576,
      "epoch": 0.1317276516905975,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005398222245275974,
      "kl": 0.003484962333459407,
      "learning_rate": 9.736637332098193e-07,
      "loss": 0.0002,
      "num_tokens": 78061759.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2844,
      "step_time": 23.896695479750633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 159.75,
      "completions/mean_terminated_length": 159.75,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.45029694586992264,
      "epoch": 0.1317739694302918,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002244369825348258,
      "kl": 0.0021674801246263087,
      "learning_rate": 9.736544696618804e-07,
      "loss": 0.0001,
      "num_tokens": 78094635.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2845,
      "step_time": 19.787635374814272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 136.125,
      "completions/mean_terminated_length": 136.125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.2084420546889305,
      "epoch": 0.13182028716998612,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002307872287929058,
      "kl": 0.0015559589373879135,
      "learning_rate": 9.736452061139416e-07,
      "loss": 0.0001,
      "num_tokens": 78114221.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2846,
      "step_time": 13.526204243302345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 311.0,
      "completions/max_terminated_length": 311.0,
      "completions/mean_length": 264.25,
      "completions/mean_terminated_length": 264.25,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.25473516434431076,
      "epoch": 0.1318666049096804,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11678625643253326,
      "kl": 0.005555804818868637,
      "learning_rate": 9.736359425660027e-07,
      "loss": -0.0889,
      "num_tokens": 78146945.0,
      "reward": 0.9171520471572876,
      "reward_std": 0.2485218346118927,
      "rewards/reward_func/mean": 0.9171520471572876,
      "rewards/reward_func/std": 0.2485218197107315,
      "step": 2847,
      "step_time": 28.150619812309742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 140.5625,
      "completions/mean_terminated_length": 140.5625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2401268556714058,
      "epoch": 0.1319129226493747,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015420548152178526,
      "kl": 0.0012628434924408793,
      "learning_rate": 9.736266790180638e-07,
      "loss": 0.0001,
      "num_tokens": 78166586.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2848,
      "step_time": 15.192064005881548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 154.5,
      "completions/mean_terminated_length": 154.5,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.1445140577852726,
      "epoch": 0.131959240389069,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025332460645586252,
      "kl": 0.0010443386563565582,
      "learning_rate": 9.73617415470125e-07,
      "loss": 0.0001,
      "num_tokens": 78204498.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 2849,
      "step_time": 19.768490180373192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 170.0625,
      "completions/mean_terminated_length": 170.0625,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.4105469658970833,
      "epoch": 0.13200555812876333,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004277004394680262,
      "kl": 0.0022296886891126633,
      "learning_rate": 9.73608151922186e-07,
      "loss": 0.0001,
      "num_tokens": 78238307.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2850,
      "step_time": 21.153131306171417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 163.6875,
      "completions/mean_terminated_length": 163.6875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.3839899152517319,
      "epoch": 0.13205187586845762,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011443643597885966,
      "kl": 0.001415493810782209,
      "learning_rate": 9.735988883742474e-07,
      "loss": 0.0001,
      "num_tokens": 78295438.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2851,
      "step_time": 27.310379087924957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 152.9375,
      "completions/mean_terminated_length": 152.9375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.33014437556266785,
      "epoch": 0.13209819360815192,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010165599174797535,
      "kl": 0.00636251294054091,
      "learning_rate": 9.735896248263085e-07,
      "loss": 0.0003,
      "num_tokens": 78316093.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2852,
      "step_time": 17.826702434569597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 123.875,
      "completions/mean_terminated_length": 123.875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.310026116669178,
      "epoch": 0.13214451134784622,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032589833717793226,
      "kl": 0.0018544544582255185,
      "learning_rate": 9.735803612783696e-07,
      "loss": 0.0001,
      "num_tokens": 78351867.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2853,
      "step_time": 17.5997002273798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 316.0,
      "completions/max_terminated_length": 316.0,
      "completions/mean_length": 283.125,
      "completions/mean_terminated_length": 283.125,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "entropy": 0.19570598751306534,
      "epoch": 0.13219082908754054,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010641536209732294,
      "kl": 0.0010448120883665979,
      "learning_rate": 9.735710977304308e-07,
      "loss": 0.0001,
      "num_tokens": 78393581.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2854,
      "step_time": 30.403636183589697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 138.4375,
      "completions/mean_terminated_length": 138.4375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.43954967707395554,
      "epoch": 0.13223714682723484,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003814935917034745,
      "kl": 0.0027921597356908023,
      "learning_rate": 9.73561834182492e-07,
      "loss": 0.0001,
      "num_tokens": 78416532.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2855,
      "step_time": 15.817701667547226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 131.0,
      "completions/mean_length": 119.8125,
      "completions/mean_terminated_length": 119.8125,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2750936970114708,
      "epoch": 0.13228346456692913,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00558044109493494,
      "kl": 0.0032425089739263058,
      "learning_rate": 9.73552570634553e-07,
      "loss": 0.0002,
      "num_tokens": 78436321.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2856,
      "step_time": 12.657209232449532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 225.375,
      "completions/mean_terminated_length": 225.375,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "entropy": 0.26289578527212143,
      "epoch": 0.13232978230662343,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006055313628166914,
      "kl": 0.0038973866030573845,
      "learning_rate": 9.735433070866141e-07,
      "loss": 0.0002,
      "num_tokens": 78469447.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2857,
      "step_time": 25.483021415770054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 115.25,
      "completions/mean_terminated_length": 115.25,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "entropy": 0.29793746024370193,
      "epoch": 0.13237610004631775,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006718107964843512,
      "kl": 0.0022733150981366634,
      "learning_rate": 9.735340435386753e-07,
      "loss": 0.0001,
      "num_tokens": 78492987.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2858,
      "step_time": 14.312612924724817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 201.875,
      "completions/mean_terminated_length": 201.875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.2685972824692726,
      "epoch": 0.13242241778601205,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09218121320009232,
      "kl": 0.0030734979663975537,
      "learning_rate": 9.735247799907364e-07,
      "loss": 0.0674,
      "num_tokens": 78515625.0,
      "reward": 0.2890387773513794,
      "reward_std": 0.010135901160538197,
      "rewards/reward_func/mean": 0.2890387773513794,
      "rewards/reward_func/std": 0.01013590395450592,
      "step": 2859,
      "step_time": 19.88095634058118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 140.6875,
      "completions/mean_terminated_length": 140.6875,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.23037349060177803,
      "epoch": 0.13246873552570634,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024827327579259872,
      "kl": 0.0014823552628513426,
      "learning_rate": 9.735155164427975e-07,
      "loss": 0.0001,
      "num_tokens": 78535444.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2860,
      "step_time": 14.67548917979002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 187.1875,
      "completions/mean_terminated_length": 187.1875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.261336587369442,
      "epoch": 0.13251505326540064,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1303095519542694,
      "kl": 0.01158877625130117,
      "learning_rate": 9.735062528948586e-07,
      "loss": 0.0107,
      "num_tokens": 78557415.0,
      "reward": 0.8026753664016724,
      "reward_std": 0.4001431465148926,
      "rewards/reward_func/mean": 0.8026753664016724,
      "rewards/reward_func/std": 0.4001431465148926,
      "step": 2861,
      "step_time": 20.49484769254923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 152.4375,
      "completions/mean_terminated_length": 152.4375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.20846930146217346,
      "epoch": 0.13256137100509496,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003349486505612731,
      "kl": 0.0025259426329284906,
      "learning_rate": 9.734969893469198e-07,
      "loss": 0.0001,
      "num_tokens": 78579934.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2862,
      "step_time": 15.887763421982527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 123.5,
      "completions/mean_terminated_length": 123.5,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.34081466495990753,
      "epoch": 0.13260768874478926,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015075166011229157,
      "kl": 0.0018579459574539214,
      "learning_rate": 9.73487725798981e-07,
      "loss": 0.0001,
      "num_tokens": 78599766.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2863,
      "step_time": 13.782018475234509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 156.0,
      "completions/mean_terminated_length": 156.0,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.19136565178632736,
      "epoch": 0.13265400648448356,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1502605527639389,
      "kl": 0.005579829099588096,
      "learning_rate": 9.734784622510422e-07,
      "loss": -0.0105,
      "num_tokens": 78626742.0,
      "reward": 0.8675934076309204,
      "reward_std": 0.02813946083188057,
      "rewards/reward_func/mean": 0.8675934076309204,
      "rewards/reward_func/std": 0.028139453381299973,
      "step": 2864,
      "step_time": 18.511396799236536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 156.5625,
      "completions/mean_terminated_length": 156.5625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.2968406602740288,
      "epoch": 0.13270032422417785,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017504510469734669,
      "kl": 0.0013929185806773603,
      "learning_rate": 9.734691987031034e-07,
      "loss": 0.0001,
      "num_tokens": 78648543.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2865,
      "step_time": 16.743940446525812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 153.125,
      "completions/mean_terminated_length": 153.125,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.40339572727680206,
      "epoch": 0.13274664196387218,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022827195934951305,
      "kl": 0.0023920593375805765,
      "learning_rate": 9.734599351551643e-07,
      "loss": 0.0001,
      "num_tokens": 78701713.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2866,
      "step_time": 23.830079551786184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 196.0,
      "completions/mean_terminated_length": 196.0,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.17589811980724335,
      "epoch": 0.13279295970356647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17251649498939514,
      "kl": 0.021240927278995514,
      "learning_rate": 9.734506716072254e-07,
      "loss": -0.0168,
      "num_tokens": 78724929.0,
      "reward": 0.9665919542312622,
      "reward_std": 0.13363230228424072,
      "rewards/reward_func/mean": 0.9665919542312622,
      "rewards/reward_func/std": 0.13363230228424072,
      "step": 2867,
      "step_time": 19.739026203751564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 126.1875,
      "completions/mean_terminated_length": 126.1875,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2696906700730324,
      "epoch": 0.13283927744326077,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004241105634719133,
      "kl": 0.00217734751640819,
      "learning_rate": 9.734414080592867e-07,
      "loss": 0.0001,
      "num_tokens": 78745620.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2868,
      "step_time": 13.344957027584314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 202.5,
      "completions/mean_terminated_length": 202.5,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.1787184216082096,
      "epoch": 0.13288559518295506,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020049912855029106,
      "kl": 0.001621152478037402,
      "learning_rate": 9.734321445113479e-07,
      "loss": 0.0001,
      "num_tokens": 78785516.0,
      "reward": 0.03273354470729828,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.03273354470729828,
      "rewards/reward_func/std": 0.0,
      "step": 2869,
      "step_time": 24.221180498600006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 132.3125,
      "completions/mean_terminated_length": 132.3125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.30913008749485016,
      "epoch": 0.1329319129226494,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004960328806191683,
      "kl": 0.002564389316830784,
      "learning_rate": 9.73422880963409e-07,
      "loss": 0.0001,
      "num_tokens": 78805377.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2870,
      "step_time": 14.58257419988513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 174.375,
      "completions/mean_terminated_length": 174.375,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.14191536977887154,
      "epoch": 0.13297823066234368,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001973430858924985,
      "kl": 0.0014234594709705561,
      "learning_rate": 9.734136174154701e-07,
      "loss": 0.0001,
      "num_tokens": 78828215.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 2871,
      "step_time": 18.145116105675697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 354.0,
      "completions/max_terminated_length": 354.0,
      "completions/mean_length": 306.0,
      "completions/mean_terminated_length": 306.0,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "entropy": 0.19115526601672173,
      "epoch": 0.13302454840203798,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09391110390424728,
      "kl": 0.005062502284999937,
      "learning_rate": 9.734043538675312e-07,
      "loss": -0.1028,
      "num_tokens": 78855943.0,
      "reward": 0.7210334539413452,
      "reward_std": 0.20943774282932281,
      "rewards/reward_func/mean": 0.7210334539413452,
      "rewards/reward_func/std": 0.20943774282932281,
      "step": 2872,
      "step_time": 29.053698629140854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 160.6875,
      "completions/mean_terminated_length": 160.6875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.17224779352545738,
      "epoch": 0.13307086614173227,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007179053034633398,
      "kl": 0.004441243247129023,
      "learning_rate": 9.733950903195924e-07,
      "loss": 0.0002,
      "num_tokens": 78888802.0,
      "reward": 0.38776010274887085,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.38776010274887085,
      "rewards/reward_func/std": 0.0,
      "step": 2873,
      "step_time": 18.88411708176136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 191.125,
      "completions/mean_terminated_length": 191.125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.370157815515995,
      "epoch": 0.1331171838814266,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00280299736186862,
      "kl": 0.002772213250864297,
      "learning_rate": 9.733858267716535e-07,
      "loss": 0.0001,
      "num_tokens": 78919604.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2874,
      "step_time": 20.773690421134233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 674.0,
      "completions/max_terminated_length": 674.0,
      "completions/mean_length": 376.5625,
      "completions/mean_terminated_length": 376.5625,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "entropy": 0.33804337680339813,
      "epoch": 0.1331635016211209,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07376828789710999,
      "kl": 0.004882515640929341,
      "learning_rate": 9.733765632237146e-07,
      "loss": -0.2977,
      "num_tokens": 78953805.0,
      "reward": 0.07358209788799286,
      "reward_std": 0.1180819422006607,
      "rewards/reward_func/mean": 0.07358209788799286,
      "rewards/reward_func/std": 0.1180819496512413,
      "step": 2875,
      "step_time": 53.88939857855439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 175.0,
      "completions/mean_terminated_length": 175.0,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.18610315024852753,
      "epoch": 0.1332098193608152,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001507364446297288,
      "kl": 0.0011621343437582254,
      "learning_rate": 9.733672996757757e-07,
      "loss": 0.0001,
      "num_tokens": 78990909.0,
      "reward": 0.8751733303070068,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8751733303070068,
      "rewards/reward_func/std": 0.0,
      "step": 2876,
      "step_time": 21.398020897060633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 144.25,
      "completions/mean_terminated_length": 144.25,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.15696774795651436,
      "epoch": 0.1332561371005095,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017617812845855951,
      "kl": 0.0013164619158487767,
      "learning_rate": 9.733580361278369e-07,
      "loss": 0.0001,
      "num_tokens": 79027377.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2877,
      "step_time": 19.813029501587152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 143.0,
      "completions/mean_terminated_length": 143.0,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.26674485206604004,
      "epoch": 0.1333024548402038,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01314203254878521,
      "kl": 0.0038373583811335266,
      "learning_rate": 9.73348772579898e-07,
      "loss": 0.0002,
      "num_tokens": 79047697.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2878,
      "step_time": 15.462935660034418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 370.0,
      "completions/max_terminated_length": 370.0,
      "completions/mean_length": 244.9375,
      "completions/mean_terminated_length": 244.9375,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "entropy": 0.3274817541241646,
      "epoch": 0.1333487725798981,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08200172334909439,
      "kl": 0.017387705855071545,
      "learning_rate": 9.733395090319591e-07,
      "loss": 0.0394,
      "num_tokens": 79085632.0,
      "reward": 0.5542079210281372,
      "reward_std": 0.3619919419288635,
      "rewards/reward_func/mean": 0.5542079210281372,
      "rewards/reward_func/std": 0.3619919419288635,
      "step": 2879,
      "step_time": 33.15012853220105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 165.5,
      "completions/mean_terminated_length": 165.5,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.23618333414196968,
      "epoch": 0.1333950903195924,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09502062201499939,
      "kl": 0.0038849368575029075,
      "learning_rate": 9.733302454840202e-07,
      "loss": -0.0665,
      "num_tokens": 79113304.0,
      "reward": 0.018163681030273438,
      "reward_std": 0.05706524848937988,
      "rewards/reward_func/mean": 0.018163681030273438,
      "rewards/reward_func/std": 0.057065244764089584,
      "step": 2880,
      "step_time": 19.765874680131674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 163.125,
      "completions/mean_terminated_length": 163.125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.18659117072820663,
      "epoch": 0.1334414080592867,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001981121487915516,
      "kl": 0.0014658432046417147,
      "learning_rate": 9.733209819360816e-07,
      "loss": 0.0001,
      "num_tokens": 79153338.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 2881,
      "step_time": 21.366734340786934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 140.5,
      "completions/mean_terminated_length": 140.5,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.25292740762233734,
      "epoch": 0.13348772579898102,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004230554215610027,
      "kl": 0.0022894427529536188,
      "learning_rate": 9.733117183881427e-07,
      "loss": 0.0001,
      "num_tokens": 79173026.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2882,
      "step_time": 14.935656115412712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 243.125,
      "completions/mean_terminated_length": 243.125,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "entropy": 0.16847683116793633,
      "epoch": 0.13353404353867532,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0687066838145256,
      "kl": 0.0009277532808482647,
      "learning_rate": 9.733024548402038e-07,
      "loss": -0.0133,
      "num_tokens": 79220244.0,
      "reward": 0.9978815317153931,
      "reward_std": 0.008473753929138184,
      "rewards/reward_func/mean": 0.9978815317153931,
      "rewards/reward_func/std": 0.008473754860460758,
      "step": 2883,
      "step_time": 28.099848553538322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 183.25,
      "completions/mean_terminated_length": 183.25,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.31974637508392334,
      "epoch": 0.1335803612783696,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0881740152835846,
      "kl": 0.006744670565240085,
      "learning_rate": 9.73293191292265e-07,
      "loss": 0.0348,
      "num_tokens": 79243304.0,
      "reward": 0.06255227327346802,
      "reward_std": 0.24998606741428375,
      "rewards/reward_func/mean": 0.06255227327346802,
      "rewards/reward_func/std": 0.24998606741428375,
      "step": 2884,
      "step_time": 21.40220644325018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 324.0,
      "completions/max_terminated_length": 324.0,
      "completions/mean_length": 300.4375,
      "completions/mean_terminated_length": 300.4375,
      "completions/min_length": 265.0,
      "completions/min_terminated_length": 265.0,
      "entropy": 0.1885397471487522,
      "epoch": 0.1336266790180639,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.05555471405386925,
      "kl": 0.0034118599724024534,
      "learning_rate": 9.73283927744326e-07,
      "loss": -0.0385,
      "num_tokens": 79283215.0,
      "reward": 0.9948967695236206,
      "reward_std": 0.01394471526145935,
      "rewards/reward_func/mean": 0.9948967695236206,
      "rewards/reward_func/std": 0.013944721780717373,
      "step": 2885,
      "step_time": 30.627264350652695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 175.625,
      "completions/mean_terminated_length": 175.625,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.1396532915532589,
      "epoch": 0.13367299675775823,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006407846696674824,
      "kl": 0.024945903103798628,
      "learning_rate": 9.732746641963872e-07,
      "loss": 0.0012,
      "num_tokens": 79312025.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2886,
      "step_time": 19.252801068127155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 173.125,
      "completions/mean_terminated_length": 173.125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.43986694514751434,
      "epoch": 0.13371931449745253,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020557146053761244,
      "kl": 0.0019736994290724397,
      "learning_rate": 9.732654006484483e-07,
      "loss": 0.0001,
      "num_tokens": 79357915.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2887,
      "step_time": 23.858646240085363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 167.375,
      "completions/mean_terminated_length": 167.375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.1891889125108719,
      "epoch": 0.13376563223714683,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004160380456596613,
      "kl": 0.003076967317610979,
      "learning_rate": 9.732561371005094e-07,
      "loss": 0.0002,
      "num_tokens": 79379969.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2888,
      "step_time": 18.755075678229332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 143.0625,
      "completions/mean_terminated_length": 143.0625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.33918070793151855,
      "epoch": 0.13381194997684112,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002751731313765049,
      "kl": 0.0021560149907600135,
      "learning_rate": 9.732468735525706e-07,
      "loss": 0.0001,
      "num_tokens": 79416082.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2889,
      "step_time": 18.360064450651407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 182.875,
      "completions/mean_terminated_length": 182.875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4022962525486946,
      "epoch": 0.13385826771653545,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004516348708420992,
      "kl": 0.0032430451828986406,
      "learning_rate": 9.732376100046317e-07,
      "loss": 0.0002,
      "num_tokens": 79437456.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2890,
      "step_time": 19.23727685213089
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 149.4375,
      "completions/mean_terminated_length": 149.4375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.45322011411190033,
      "epoch": 0.13390458545622974,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002992769004777074,
      "kl": 0.0031186541309580207,
      "learning_rate": 9.732283464566928e-07,
      "loss": 0.0002,
      "num_tokens": 79478807.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2891,
      "step_time": 20.694837115705013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 166.75,
      "completions/mean_terminated_length": 166.75,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.37650372087955475,
      "epoch": 0.13395090319592404,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00413138372823596,
      "kl": 0.002810453821439296,
      "learning_rate": 9.73219082908754e-07,
      "loss": 0.0001,
      "num_tokens": 79505987.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2892,
      "step_time": 19.3064882196486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 306.0,
      "completions/max_terminated_length": 306.0,
      "completions/mean_length": 279.0,
      "completions/mean_terminated_length": 279.0,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "entropy": 0.25057096034288406,
      "epoch": 0.13399722093561833,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0678534209728241,
      "kl": 0.0033562793396413326,
      "learning_rate": 9.73209819360815e-07,
      "loss": -0.0279,
      "num_tokens": 79531731.0,
      "reward": 0.6970856189727783,
      "reward_std": 0.14487607777118683,
      "rewards/reward_func/mean": 0.6970856189727783,
      "rewards/reward_func/std": 0.14487609267234802,
      "step": 2893,
      "step_time": 25.717489823698997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 160.5625,
      "completions/mean_terminated_length": 160.5625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.21365992724895477,
      "epoch": 0.13404353867531266,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014544484438374639,
      "kl": 0.0009965770732378587,
      "learning_rate": 9.732005558128764e-07,
      "loss": 0.0,
      "num_tokens": 79562892.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2894,
      "step_time": 19.14187029749155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 127.375,
      "completions/mean_terminated_length": 127.375,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.22835980728268623,
      "epoch": 0.13408985641500695,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021027359180152416,
      "kl": 0.0016507274995092303,
      "learning_rate": 9.731912922649375e-07,
      "loss": 0.0001,
      "num_tokens": 79583170.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2895,
      "step_time": 15.572048984467983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 127.1875,
      "completions/mean_terminated_length": 127.1875,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.2573355510830879,
      "epoch": 0.13413617415470125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003158706473186612,
      "kl": 0.0019519127672538161,
      "learning_rate": 9.731820287169987e-07,
      "loss": 0.0001,
      "num_tokens": 79603029.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2896,
      "step_time": 14.71007889136672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 144.9375,
      "completions/mean_terminated_length": 144.9375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2831461504101753,
      "epoch": 0.13418249189439554,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10067969560623169,
      "kl": 0.003846227133180946,
      "learning_rate": 9.731727651690596e-07,
      "loss": -0.0102,
      "num_tokens": 79625508.0,
      "reward": 0.9107850790023804,
      "reward_std": 0.023790644481778145,
      "rewards/reward_func/mean": 0.9107850790023804,
      "rewards/reward_func/std": 0.023790642619132996,
      "step": 2897,
      "step_time": 16.928314816206694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 179.1875,
      "completions/mean_terminated_length": 179.1875,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.43180882185697556,
      "epoch": 0.13422880963408987,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006570231169462204,
      "kl": 0.00406917673535645,
      "learning_rate": 9.73163501621121e-07,
      "loss": 0.0002,
      "num_tokens": 79651943.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2898,
      "step_time": 20.983196452260017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 126.0625,
      "completions/mean_terminated_length": 126.0625,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.3219418525695801,
      "epoch": 0.13427512737378416,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006617399863898754,
      "kl": 0.002424533828161657,
      "learning_rate": 9.73154238073182e-07,
      "loss": 0.0001,
      "num_tokens": 79678040.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2899,
      "step_time": 16.985364101827145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 129.5625,
      "completions/mean_terminated_length": 129.5625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.4039051756262779,
      "epoch": 0.13432144511347846,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031726134475320578,
      "kl": 0.002316710742888972,
      "learning_rate": 9.731449745252432e-07,
      "loss": 0.0001,
      "num_tokens": 79710337.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2900,
      "step_time": 15.91283344477415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 167.125,
      "completions/mean_terminated_length": 167.125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.22085124626755714,
      "epoch": 0.13436776285317276,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11789902299642563,
      "kl": 0.0022243360290303826,
      "learning_rate": 9.731357109773043e-07,
      "loss": 0.0324,
      "num_tokens": 79731379.0,
      "reward": 0.8954626321792603,
      "reward_std": 0.04066455364227295,
      "rewards/reward_func/mean": 0.8954626321792603,
      "rewards/reward_func/std": 0.04066455364227295,
      "step": 2901,
      "step_time": 17.55325523391366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 313.0,
      "completions/max_terminated_length": 313.0,
      "completions/mean_length": 241.1875,
      "completions/mean_terminated_length": 241.1875,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "entropy": 0.23715194314718246,
      "epoch": 0.13441408059286708,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006826833356171846,
      "kl": 0.008999955840408802,
      "learning_rate": 9.731264474293654e-07,
      "loss": 0.0004,
      "num_tokens": 79757510.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2902,
      "step_time": 26.31505984812975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 119.3125,
      "completions/mean_terminated_length": 119.3125,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.3403228744864464,
      "epoch": 0.13446039833256138,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004008160438388586,
      "kl": 0.0024982955947052687,
      "learning_rate": 9.731171838814265e-07,
      "loss": 0.0001,
      "num_tokens": 79777803.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2903,
      "step_time": 13.930200260132551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 201.25,
      "completions/mean_terminated_length": 201.25,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.4237442836165428,
      "epoch": 0.13450671607225567,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005169948097318411,
      "kl": 0.004285382223315537,
      "learning_rate": 9.731079203334877e-07,
      "loss": 0.0002,
      "num_tokens": 79812303.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2904,
      "step_time": 23.392813712358475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 344.0,
      "completions/max_terminated_length": 344.0,
      "completions/mean_length": 270.125,
      "completions/mean_terminated_length": 270.125,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.49060673266649246,
      "epoch": 0.13455303381194997,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07705912739038467,
      "kl": 0.0030304257525131106,
      "learning_rate": 9.730986567855488e-07,
      "loss": 0.089,
      "num_tokens": 79837057.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 2905,
      "step_time": 27.92110213637352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 130.3125,
      "completions/mean_terminated_length": 130.3125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2768367826938629,
      "epoch": 0.1345993515516443,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007510158233344555,
      "kl": 0.003734195663128048,
      "learning_rate": 9.7308939323761e-07,
      "loss": 0.0002,
      "num_tokens": 79857238.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2906,
      "step_time": 14.888576116412878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 175.1875,
      "completions/mean_terminated_length": 175.1875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.269774716347456,
      "epoch": 0.1346456692913386,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011981590650975704,
      "kl": 0.008176260511390865,
      "learning_rate": 9.73080129689671e-07,
      "loss": 0.0004,
      "num_tokens": 79883369.0,
      "reward": 0.08882806450128555,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.08882806450128555,
      "rewards/reward_func/std": 0.0,
      "step": 2907,
      "step_time": 18.90694124996662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 206.6875,
      "completions/mean_terminated_length": 206.6875,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.20256155729293823,
      "epoch": 0.13469198703103288,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006589134223759174,
      "kl": 0.005259062745608389,
      "learning_rate": 9.730708661417324e-07,
      "loss": 0.0003,
      "num_tokens": 79909540.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2908,
      "step_time": 20.471765104681253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 257.0,
      "completions/max_terminated_length": 257.0,
      "completions/mean_length": 236.875,
      "completions/mean_terminated_length": 236.875,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "entropy": 0.2770218923687935,
      "epoch": 0.13473830477072718,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0778183862566948,
      "kl": 0.007008157903328538,
      "learning_rate": 9.730616025937933e-07,
      "loss": -0.0185,
      "num_tokens": 79934066.0,
      "reward": 0.7777365446090698,
      "reward_std": 0.27077364921569824,
      "rewards/reward_func/mean": 0.7777365446090698,
      "rewards/reward_func/std": 0.27077367901802063,
      "step": 2909,
      "step_time": 22.851427253335714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 161.5,
      "completions/mean_terminated_length": 161.5,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.40292152017354965,
      "epoch": 0.1347846225104215,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015367643209174275,
      "kl": 0.0018360107787884772,
      "learning_rate": 9.730523390458544e-07,
      "loss": 0.0001,
      "num_tokens": 79970778.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2910,
      "step_time": 21.00841509178281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 195.0,
      "completions/mean_terminated_length": 195.0,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.45231635868549347,
      "epoch": 0.1348309402501158,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012612712569534779,
      "kl": 0.005916845519095659,
      "learning_rate": 9.730430754979157e-07,
      "loss": 0.0003,
      "num_tokens": 80022730.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2911,
      "step_time": 27.85970949009061
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 232.6875,
      "completions/mean_terminated_length": 232.6875,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "entropy": 0.32124925404787064,
      "epoch": 0.1348772579898101,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08090087026357651,
      "kl": 0.002967926091514528,
      "learning_rate": 9.730338119499769e-07,
      "loss": 0.0165,
      "num_tokens": 80045237.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 2912,
      "step_time": 21.96769331395626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 147.3125,
      "completions/mean_terminated_length": 147.3125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.24899177998304367,
      "epoch": 0.1349235757295044,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038116220384836197,
      "kl": 0.0020863708632532507,
      "learning_rate": 9.73024548402038e-07,
      "loss": 0.0001,
      "num_tokens": 80068650.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2913,
      "step_time": 16.062062311917543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 185.25,
      "completions/mean_terminated_length": 185.25,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.4191589877009392,
      "epoch": 0.13496989346919872,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005117417778819799,
      "kl": 0.003462464897893369,
      "learning_rate": 9.730152848540991e-07,
      "loss": 0.0002,
      "num_tokens": 80098574.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2914,
      "step_time": 21.375920746475458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 385.0,
      "completions/max_terminated_length": 385.0,
      "completions/mean_length": 318.625,
      "completions/mean_terminated_length": 318.625,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "entropy": 0.29498157650232315,
      "epoch": 0.135016211208893,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0670662447810173,
      "kl": 0.005866886465810239,
      "learning_rate": 9.730060213061602e-07,
      "loss": -0.1551,
      "num_tokens": 80128648.0,
      "reward": 0.20415154099464417,
      "reward_std": 0.15054519474506378,
      "rewards/reward_func/mean": 0.20415154099464417,
      "rewards/reward_func/std": 0.15054519474506378,
      "step": 2915,
      "step_time": 31.901460755616426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 129.0,
      "completions/mean_terminated_length": 129.0,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.29677698016166687,
      "epoch": 0.1350625289485873,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028735266532748938,
      "kl": 0.0016636458167340606,
      "learning_rate": 9.729967577582214e-07,
      "loss": 0.0001,
      "num_tokens": 80156136.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2916,
      "step_time": 16.00371688231826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 123.4375,
      "completions/mean_terminated_length": 123.4375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.30348125100135803,
      "epoch": 0.1351088466882816,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021966758649796247,
      "kl": 0.001968748401850462,
      "learning_rate": 9.729874942102825e-07,
      "loss": 0.0001,
      "num_tokens": 80180159.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2917,
      "step_time": 14.408776670694351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 148.25,
      "completions/mean_terminated_length": 148.25,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.199592687189579,
      "epoch": 0.13515516442797593,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033022004645317793,
      "kl": 0.0021382744307629764,
      "learning_rate": 9.729782306623436e-07,
      "loss": 0.0001,
      "num_tokens": 80200947.0,
      "reward": 0.8507331609725952,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8507331609725952,
      "rewards/reward_func/std": 0.0,
      "step": 2918,
      "step_time": 15.433997303247452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 172.9375,
      "completions/mean_terminated_length": 172.9375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.40478938817977905,
      "epoch": 0.13520148216767022,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0042166030034422874,
      "kl": 0.0031098597100935876,
      "learning_rate": 9.729689671144047e-07,
      "loss": 0.0002,
      "num_tokens": 80222322.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2919,
      "step_time": 17.935936015099287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 192.0625,
      "completions/mean_terminated_length": 192.0625,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.24535804614424706,
      "epoch": 0.13524779990736452,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034478441812098026,
      "kl": 0.0026309596141800284,
      "learning_rate": 9.729597035664659e-07,
      "loss": 0.0001,
      "num_tokens": 80243715.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2920,
      "step_time": 18.642922777682543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 140.75,
      "completions/mean_terminated_length": 140.75,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.32133422791957855,
      "epoch": 0.13529411764705881,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026950486935675144,
      "kl": 0.002045614004600793,
      "learning_rate": 9.72950440018527e-07,
      "loss": 0.0001,
      "num_tokens": 80264239.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2921,
      "step_time": 15.687923986464739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 148.25,
      "completions/mean_terminated_length": 148.25,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.34213197976350784,
      "epoch": 0.13534043538675314,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012444699183106422,
      "kl": 0.0053047959809191525,
      "learning_rate": 9.729411764705881e-07,
      "loss": 0.0003,
      "num_tokens": 80300755.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2922,
      "step_time": 20.790783379226923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 141.875,
      "completions/mean_terminated_length": 141.875,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.1986936368048191,
      "epoch": 0.13538675312644743,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037879047449678183,
      "kl": 0.002654906886164099,
      "learning_rate": 9.729319129226492e-07,
      "loss": 0.0001,
      "num_tokens": 80321153.0,
      "reward": 0.21578796207904816,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.21578796207904816,
      "rewards/reward_func/std": 0.0,
      "step": 2923,
      "step_time": 15.273275960236788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 136.375,
      "completions/mean_terminated_length": 136.375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.24271875619888306,
      "epoch": 0.13543307086614173,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003306017490103841,
      "kl": 0.0020956738444510847,
      "learning_rate": 9.729226493747106e-07,
      "loss": 0.0001,
      "num_tokens": 80340743.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2924,
      "step_time": 14.392927952110767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 160.125,
      "completions/mean_terminated_length": 160.125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.19377048686146736,
      "epoch": 0.13547938860583603,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15734557807445526,
      "kl": 0.0018305819248780608,
      "learning_rate": 9.729133858267717e-07,
      "loss": 0.0111,
      "num_tokens": 80365977.0,
      "reward": 0.34278643131256104,
      "reward_std": 0.007662854623049498,
      "rewards/reward_func/mean": 0.34278643131256104,
      "rewards/reward_func/std": 0.007662852294743061,
      "step": 2925,
      "step_time": 16.507501907646656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 278.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 222.75,
      "completions/mean_terminated_length": 222.75,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.2721988260746002,
      "epoch": 0.13552570634553035,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0854281559586525,
      "kl": 0.006537881796248257,
      "learning_rate": 9.729041222788328e-07,
      "loss": 0.058,
      "num_tokens": 80390437.0,
      "reward": 0.7393002510070801,
      "reward_std": 0.006071718409657478,
      "rewards/reward_func/mean": 0.7393002510070801,
      "rewards/reward_func/std": 0.006071717012673616,
      "step": 2926,
      "step_time": 23.67028560861945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 219.375,
      "completions/mean_terminated_length": 219.375,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.4652087688446045,
      "epoch": 0.13557202408522465,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008797567337751389,
      "kl": 0.006701090256683528,
      "learning_rate": 9.72894858730894e-07,
      "loss": 0.0003,
      "num_tokens": 80421643.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2927,
      "step_time": 22.19981612637639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 182.375,
      "completions/mean_terminated_length": 182.375,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.17625219747424126,
      "epoch": 0.13561834182491894,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08175390213727951,
      "kl": 0.001726919668726623,
      "learning_rate": 9.72885595182955e-07,
      "loss": 0.0101,
      "num_tokens": 80444321.0,
      "reward": 0.9493370056152344,
      "reward_std": 0.013510131277143955,
      "rewards/reward_func/mean": 0.9493370056152344,
      "rewards/reward_func/std": 0.013510138727724552,
      "step": 2928,
      "step_time": 18.106609422713518
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 332.0,
      "completions/max_terminated_length": 332.0,
      "completions/mean_length": 240.875,
      "completions/mean_terminated_length": 240.875,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.3816415071487427,
      "epoch": 0.13566465956461324,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08109243214130402,
      "kl": 0.00977090816013515,
      "learning_rate": 9.728763316350162e-07,
      "loss": -0.0956,
      "num_tokens": 80482319.0,
      "reward": 0.44238895177841187,
      "reward_std": 0.4039229154586792,
      "rewards/reward_func/mean": 0.44238895177841187,
      "rewards/reward_func/std": 0.4039229452610016,
      "step": 2929,
      "step_time": 30.66715943813324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 166.4375,
      "completions/mean_terminated_length": 166.4375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.2888646200299263,
      "epoch": 0.13571097730430756,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12389273196458817,
      "kl": 0.0038977753720246255,
      "learning_rate": 9.728670680870773e-07,
      "loss": -0.0277,
      "num_tokens": 80502486.0,
      "reward": 0.5625,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.5625,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 2930,
      "step_time": 16.247186593711376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 137.125,
      "completions/mean_terminated_length": 137.125,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.32616279274225235,
      "epoch": 0.13575729504400186,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002884137211367488,
      "kl": 0.0022058217437006533,
      "learning_rate": 9.728578045391384e-07,
      "loss": 0.0001,
      "num_tokens": 80522824.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2931,
      "step_time": 14.713385853916407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 384.0,
      "completions/mean_length": 292.625,
      "completions/mean_terminated_length": 292.625,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "entropy": 0.3592124730348587,
      "epoch": 0.13580361278369615,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07550641894340515,
      "kl": 0.0037497011944651604,
      "learning_rate": 9.728485409911996e-07,
      "loss": -0.1445,
      "num_tokens": 80553986.0,
      "reward": 0.5031079649925232,
      "reward_std": 0.40248632431030273,
      "rewards/reward_func/mean": 0.5031079649925232,
      "rewards/reward_func/std": 0.4024863541126251,
      "step": 2932,
      "step_time": 32.35196726769209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 115.5625,
      "completions/mean_terminated_length": 115.5625,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.27754029631614685,
      "epoch": 0.13584993052339045,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028542224317789078,
      "kl": 0.0020283262128941715,
      "learning_rate": 9.728392774432607e-07,
      "loss": 0.0001,
      "num_tokens": 80575435.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2933,
      "step_time": 13.490512564778328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 149.0,
      "completions/mean_terminated_length": 149.0,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3260805085301399,
      "epoch": 0.13589624826308477,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003782173851504922,
      "kl": 0.002757948881480843,
      "learning_rate": 9.728300138953218e-07,
      "loss": 0.0001,
      "num_tokens": 80598459.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2934,
      "step_time": 16.095183614641428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 120.0,
      "completions/max_terminated_length": 120.0,
      "completions/mean_length": 100.8125,
      "completions/mean_terminated_length": 100.8125,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "entropy": 0.26286860555410385,
      "epoch": 0.13594256600277907,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003110466292127967,
      "kl": 0.0018810816982295364,
      "learning_rate": 9.72820750347383e-07,
      "loss": 0.0001,
      "num_tokens": 80617816.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2935,
      "step_time": 11.666656825691462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 154.3125,
      "completions/mean_terminated_length": 154.3125,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.16198613867163658,
      "epoch": 0.13598888374247337,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013472113059833646,
      "kl": 0.0010267670004395768,
      "learning_rate": 9.72811486799444e-07,
      "loss": 0.0001,
      "num_tokens": 80649581.0,
      "reward": 0.25572916865348816,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.25572916865348816,
      "rewards/reward_func/std": 0.0,
      "step": 2936,
      "step_time": 18.152310617268085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 152.0,
      "completions/mean_terminated_length": 152.0,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.4010363295674324,
      "epoch": 0.13603520148216766,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021539137233048677,
      "kl": 0.0020854767644777894,
      "learning_rate": 9.728022232515052e-07,
      "loss": 0.0001,
      "num_tokens": 80691981.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2937,
      "step_time": 23.469403725117445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 190.4375,
      "completions/mean_terminated_length": 190.4375,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.43749643862247467,
      "epoch": 0.13608151922186199,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036167947109788656,
      "kl": 0.003282747173216194,
      "learning_rate": 9.727929597035665e-07,
      "loss": 0.0002,
      "num_tokens": 80715572.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2938,
      "step_time": 20.30497931689024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 154.9375,
      "completions/mean_terminated_length": 154.9375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.22568828985095024,
      "epoch": 0.13612783696155628,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001811955007724464,
      "kl": 0.001611333544133231,
      "learning_rate": 9.727836961556277e-07,
      "loss": 0.0001,
      "num_tokens": 80747731.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2939,
      "step_time": 18.38596522435546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 142.3125,
      "completions/mean_terminated_length": 142.3125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.32939816266298294,
      "epoch": 0.13617415470125058,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026357651222497225,
      "kl": 0.001972697500605136,
      "learning_rate": 9.727744326076886e-07,
      "loss": 0.0001,
      "num_tokens": 80774312.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2940,
      "step_time": 16.551889911293983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 134.0,
      "completions/mean_terminated_length": 134.0,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.30944251269102097,
      "epoch": 0.13622047244094487,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035243777092546225,
      "kl": 0.002230473415693268,
      "learning_rate": 9.7276516905975e-07,
      "loss": 0.0001,
      "num_tokens": 80800472.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2941,
      "step_time": 16.00205109268427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 206.75,
      "completions/mean_terminated_length": 206.75,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.2383054681122303,
      "epoch": 0.1362667901806392,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010360358282923698,
      "kl": 0.003712862846441567,
      "learning_rate": 9.72755905511811e-07,
      "loss": 0.0002,
      "num_tokens": 80823892.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2942,
      "step_time": 20.769436541944742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 173.6875,
      "completions/mean_terminated_length": 173.6875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.1985713616013527,
      "epoch": 0.1363131079203335,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1353738158941269,
      "kl": 0.002829553181072697,
      "learning_rate": 9.727466419638722e-07,
      "loss": -0.0043,
      "num_tokens": 80851615.0,
      "reward": 0.571158766746521,
      "reward_std": 0.0052941846661269665,
      "rewards/reward_func/mean": 0.571158766746521,
      "rewards/reward_func/std": 0.005294173490256071,
      "step": 2943,
      "step_time": 19.30748025327921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 173.375,
      "completions/mean_terminated_length": 173.375,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.18440937623381615,
      "epoch": 0.1363594256600278,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10002607852220535,
      "kl": 0.0036318711936473846,
      "learning_rate": 9.727373784159333e-07,
      "loss": -0.0119,
      "num_tokens": 80879685.0,
      "reward": 0.9840062260627747,
      "reward_std": 0.06397509574890137,
      "rewards/reward_func/mean": 0.9840062260627747,
      "rewards/reward_func/std": 0.06397509574890137,
      "step": 2944,
      "step_time": 18.639881521463394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 195.375,
      "completions/mean_terminated_length": 195.375,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.2200978547334671,
      "epoch": 0.13640574339972208,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003600543364882469,
      "kl": 0.0021891340729780495,
      "learning_rate": 9.727281148679944e-07,
      "loss": 0.0001,
      "num_tokens": 80901147.0,
      "reward": 0.6976763010025024,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6976763010025024,
      "rewards/reward_func/std": 0.0,
      "step": 2945,
      "step_time": 19.604786157608032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 116.75,
      "completions/mean_terminated_length": 116.75,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.25810839980840683,
      "epoch": 0.1364520611394164,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025552993174642324,
      "kl": 0.0018019027484115213,
      "learning_rate": 9.727188513200555e-07,
      "loss": 0.0001,
      "num_tokens": 80924151.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2946,
      "step_time": 14.32331271842122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 127.0,
      "completions/max_terminated_length": 127.0,
      "completions/mean_length": 107.5,
      "completions/mean_terminated_length": 107.5,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "entropy": 0.2829029783606529,
      "epoch": 0.1364983788791107,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002424074336886406,
      "kl": 0.0014853333414066583,
      "learning_rate": 9.727095877721167e-07,
      "loss": 0.0001,
      "num_tokens": 80943903.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2947,
      "step_time": 12.356964226812124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 193.75,
      "completions/mean_terminated_length": 193.75,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.24649735167622566,
      "epoch": 0.136544696618805,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09459095448255539,
      "kl": 0.003821521357167512,
      "learning_rate": 9.727003242241778e-07,
      "loss": -0.0151,
      "num_tokens": 80966347.0,
      "reward": 0.9810665845870972,
      "reward_std": 0.02900378406047821,
      "rewards/reward_func/mean": 0.9810665845870972,
      "rewards/reward_func/std": 0.02900378406047821,
      "step": 2948,
      "step_time": 19.565968200564384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 309.0,
      "completions/max_terminated_length": 309.0,
      "completions/mean_length": 208.875,
      "completions/mean_terminated_length": 208.875,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.4957779496908188,
      "epoch": 0.1365910143584993,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11119517683982849,
      "kl": 0.002874163561500609,
      "learning_rate": 9.72691060676239e-07,
      "loss": 0.1147,
      "num_tokens": 80990121.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 2949,
      "step_time": 25.593505263328552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 124.9375,
      "completions/mean_terminated_length": 124.9375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.28258949145674706,
      "epoch": 0.13663733209819362,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003415545215830207,
      "kl": 0.0023273377446457744,
      "learning_rate": 9.726817971283e-07,
      "loss": 0.0001,
      "num_tokens": 81009608.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2950,
      "step_time": 13.205769643187523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 474.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 438.25,
      "completions/mean_terminated_length": 438.25,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "entropy": 0.16405363380908966,
      "epoch": 0.13668364983788792,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016253968933597207,
      "kl": 0.001384514122037217,
      "learning_rate": 9.726725335803614e-07,
      "loss": 0.0001,
      "num_tokens": 81045068.0,
      "reward": 0.6524353623390198,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6524353623390198,
      "rewards/reward_func/std": 0.0,
      "step": 2951,
      "step_time": 39.95747434720397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 141.5,
      "completions/mean_terminated_length": 141.5,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.29256442934274673,
      "epoch": 0.1367299675775822,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003066990291699767,
      "kl": 0.0021466032485477626,
      "learning_rate": 9.726632700324223e-07,
      "loss": 0.0001,
      "num_tokens": 81068500.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2952,
      "step_time": 16.507258500903845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 152.5625,
      "completions/mean_terminated_length": 152.5625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.2603667229413986,
      "epoch": 0.1367762853172765,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.016266319900751114,
      "kl": 0.0039931878563947976,
      "learning_rate": 9.726540064844834e-07,
      "loss": 0.0002,
      "num_tokens": 81089293.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2953,
      "step_time": 15.947432920336723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 111.0,
      "completions/mean_terminated_length": 111.0,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.32477306574583054,
      "epoch": 0.13682260305697083,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005752775352448225,
      "kl": 0.0026571782655082643,
      "learning_rate": 9.726447429365445e-07,
      "loss": 0.0001,
      "num_tokens": 81113885.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2954,
      "step_time": 14.212970558553934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 172.125,
      "completions/mean_terminated_length": 172.125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.265098437666893,
      "epoch": 0.13686892079666513,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002817682223394513,
      "kl": 0.0022179307125043124,
      "learning_rate": 9.726354793886059e-07,
      "loss": 0.0001,
      "num_tokens": 81137215.0,
      "reward": 0.7376042604446411,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7376042604446411,
      "rewards/reward_func/std": 0.0,
      "step": 2955,
      "step_time": 18.107690293341875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 130.3125,
      "completions/mean_terminated_length": 130.3125,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.32519863545894623,
      "epoch": 0.13691523853635942,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002365069929510355,
      "kl": 0.00207607468473725,
      "learning_rate": 9.72626215840667e-07,
      "loss": 0.0001,
      "num_tokens": 81161124.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2956,
      "step_time": 14.70854127407074
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 305.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 261.625,
      "completions/mean_terminated_length": 261.625,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "entropy": 0.2559128552675247,
      "epoch": 0.13696155627605372,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018688369309529662,
      "kl": 0.0015516452840529382,
      "learning_rate": 9.726169522927281e-07,
      "loss": 0.0001,
      "num_tokens": 81188606.0,
      "reward": 0.7738244533538818,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7738244533538818,
      "rewards/reward_func/std": 0.0,
      "step": 2957,
      "step_time": 26.525179404765368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 291.0,
      "completions/max_terminated_length": 291.0,
      "completions/mean_length": 205.1875,
      "completions/mean_terminated_length": 205.1875,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3751988708972931,
      "epoch": 0.13700787401574804,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10362628102302551,
      "kl": 0.005429615383036435,
      "learning_rate": 9.726076887447892e-07,
      "loss": -0.1505,
      "num_tokens": 81210945.0,
      "reward": 0.31229549646377563,
      "reward_std": 0.41644027829170227,
      "rewards/reward_func/mean": 0.31229549646377563,
      "rewards/reward_func/std": 0.41644027829170227,
      "step": 2958,
      "step_time": 24.359685085713863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 211.5,
      "completions/mean_terminated_length": 211.5,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.23295798897743225,
      "epoch": 0.13705419175544234,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11378776282072067,
      "kl": 0.012075455160811543,
      "learning_rate": 9.725984251968504e-07,
      "loss": -0.0314,
      "num_tokens": 81248777.0,
      "reward": 0.9636383056640625,
      "reward_std": 0.09935915470123291,
      "rewards/reward_func/mean": 0.9636383056640625,
      "rewards/reward_func/std": 0.09935914725065231,
      "step": 2959,
      "step_time": 24.62277100980282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 137.875,
      "completions/mean_terminated_length": 137.875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.3242144212126732,
      "epoch": 0.13710050949513664,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004124994855374098,
      "kl": 0.0021949128131382167,
      "learning_rate": 9.725891616489115e-07,
      "loss": 0.0001,
      "num_tokens": 81274503.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2960,
      "step_time": 16.611236076802015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 165.625,
      "completions/mean_terminated_length": 165.625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3944648876786232,
      "epoch": 0.13714682723483093,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020955982618033886,
      "kl": 0.00184916183934547,
      "learning_rate": 9.725798981009726e-07,
      "loss": 0.0001,
      "num_tokens": 81309185.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2961,
      "step_time": 19.850964810699224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 177.125,
      "completions/mean_terminated_length": 177.125,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.22054476290941238,
      "epoch": 0.13719314497452526,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029245137702673674,
      "kl": 0.0024541362072341144,
      "learning_rate": 9.725706345530337e-07,
      "loss": 0.0001,
      "num_tokens": 81342803.0,
      "reward": 0.581777811050415,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.581777811050415,
      "rewards/reward_func/std": 0.0,
      "step": 2962,
      "step_time": 20.51937496289611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 155.375,
      "completions/mean_terminated_length": 155.375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.23320022225379944,
      "epoch": 0.13723946271421955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005797238554805517,
      "kl": 0.0036599887534976006,
      "learning_rate": 9.725613710050949e-07,
      "loss": 0.0002,
      "num_tokens": 81371241.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 2963,
      "step_time": 17.888550620526075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 376.0,
      "completions/max_terminated_length": 376.0,
      "completions/mean_length": 289.25,
      "completions/mean_terminated_length": 289.25,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "entropy": 0.30964043736457825,
      "epoch": 0.13728578045391385,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07827834784984589,
      "kl": 0.005486906738951802,
      "learning_rate": 9.725521074571562e-07,
      "loss": -0.1153,
      "num_tokens": 81411757.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 2964,
      "step_time": 34.58116399124265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 154.5625,
      "completions/mean_terminated_length": 154.5625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.21548538655042648,
      "epoch": 0.13733209819360814,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13951675593852997,
      "kl": 0.007275815587490797,
      "learning_rate": 9.725428439092171e-07,
      "loss": -0.0208,
      "num_tokens": 81441718.0,
      "reward": 0.3506828844547272,
      "reward_std": 0.0201385710388422,
      "rewards/reward_func/mean": 0.3506828844547272,
      "rewards/reward_func/std": 0.020138569176197052,
      "step": 2965,
      "step_time": 17.99277178570628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 148.1875,
      "completions/mean_terminated_length": 148.1875,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3910243958234787,
      "epoch": 0.13737841593330247,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020106330048292875,
      "kl": 0.002160381118301302,
      "learning_rate": 9.725335803612782e-07,
      "loss": 0.0001,
      "num_tokens": 81495641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2966,
      "step_time": 25.105138290673494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 192.125,
      "completions/mean_terminated_length": 192.125,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.3421904370188713,
      "epoch": 0.13742473367299676,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004232991952449083,
      "kl": 0.003913190448656678,
      "learning_rate": 9.725243168133394e-07,
      "loss": 0.0002,
      "num_tokens": 81518379.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2967,
      "step_time": 19.195054043084383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 201.375,
      "completions/mean_terminated_length": 201.375,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3975019305944443,
      "epoch": 0.13747105141269106,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021872776560485363,
      "kl": 0.0022378937574103475,
      "learning_rate": 9.725150532654007e-07,
      "loss": 0.0001,
      "num_tokens": 81555297.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2968,
      "step_time": 23.513963136821985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 119.0,
      "completions/mean_terminated_length": 119.0,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.2754397690296173,
      "epoch": 0.13751736915238535,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023599236737936735,
      "kl": 0.0016544149548280984,
      "learning_rate": 9.725057897174618e-07,
      "loss": 0.0001,
      "num_tokens": 81577873.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2969,
      "step_time": 14.453000880777836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 258.625,
      "completions/mean_terminated_length": 258.625,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "entropy": 0.28134264796972275,
      "epoch": 0.13756368689207968,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08394055813550949,
      "kl": 0.003864644793793559,
      "learning_rate": 9.72496526169523e-07,
      "loss": 0.1294,
      "num_tokens": 81605547.0,
      "reward": 0.6885161995887756,
      "reward_std": 0.2206767350435257,
      "rewards/reward_func/mean": 0.6885161995887756,
      "rewards/reward_func/std": 0.2206767350435257,
      "step": 2970,
      "step_time": 27.560300171375275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 156.625,
      "completions/mean_terminated_length": 156.625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.2121870256960392,
      "epoch": 0.13761000463177397,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005776191595941782,
      "kl": 0.003393293300177902,
      "learning_rate": 9.72487262621584e-07,
      "loss": 0.0002,
      "num_tokens": 81626133.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 2971,
      "step_time": 15.72008827328682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 155.6875,
      "completions/mean_terminated_length": 155.6875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3848320543766022,
      "epoch": 0.13765632237146827,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007267037406563759,
      "kl": 0.0029213608358986676,
      "learning_rate": 9.724779990736452e-07,
      "loss": 0.0001,
      "num_tokens": 81653872.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2972,
      "step_time": 19.16296986490488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 143.0625,
      "completions/mean_terminated_length": 143.0625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.2205166332423687,
      "epoch": 0.13770264011116257,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002539976965636015,
      "kl": 0.0017141374410130084,
      "learning_rate": 9.724687355257063e-07,
      "loss": 0.0001,
      "num_tokens": 81673601.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2973,
      "step_time": 13.995499208569527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 132.5,
      "completions/mean_terminated_length": 132.5,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.252107672393322,
      "epoch": 0.1377489578508569,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003110184334218502,
      "kl": 0.0016299804265145212,
      "learning_rate": 9.724594719777675e-07,
      "loss": 0.0001,
      "num_tokens": 81702361.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2974,
      "step_time": 17.1572062112391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 122.6875,
      "completions/mean_terminated_length": 122.6875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.30890874564647675,
      "epoch": 0.1377952755905512,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00426195515319705,
      "kl": 0.0026575836818665266,
      "learning_rate": 9.724502084298286e-07,
      "loss": 0.0001,
      "num_tokens": 81723316.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2975,
      "step_time": 13.773773342370987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 173.5,
      "completions/mean_terminated_length": 173.5,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.4315086305141449,
      "epoch": 0.13784159333024548,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001540576689876616,
      "kl": 0.0017092112975660712,
      "learning_rate": 9.724409448818897e-07,
      "loss": 0.0001,
      "num_tokens": 81758476.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2976,
      "step_time": 20.798981700092554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 129.1875,
      "completions/mean_terminated_length": 129.1875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.27639901265501976,
      "epoch": 0.13788791106993978,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001724769128486514,
      "kl": 0.001594531029695645,
      "learning_rate": 9.724316813339508e-07,
      "loss": 0.0001,
      "num_tokens": 81778351.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2977,
      "step_time": 15.520280051976442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 171.4375,
      "completions/mean_terminated_length": 171.4375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3911156952381134,
      "epoch": 0.1379342288096341,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004675261210650206,
      "kl": 0.0030932281515561044,
      "learning_rate": 9.72422417786012e-07,
      "loss": 0.0002,
      "num_tokens": 81804390.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2978,
      "step_time": 20.01363567262888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 196.6875,
      "completions/mean_terminated_length": 196.6875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.19053053855895996,
      "epoch": 0.1379805465493284,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00747695891186595,
      "kl": 0.005894030095078051,
      "learning_rate": 9.72413154238073e-07,
      "loss": 0.0003,
      "num_tokens": 81836609.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2979,
      "step_time": 22.578228179365396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 170.75,
      "completions/mean_terminated_length": 170.75,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.37082868814468384,
      "epoch": 0.1380268642890227,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00807334017008543,
      "kl": 0.006216909969225526,
      "learning_rate": 9.724038906901342e-07,
      "loss": 0.0003,
      "num_tokens": 81861853.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2980,
      "step_time": 17.812676947563887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 356.0,
      "completions/max_terminated_length": 356.0,
      "completions/mean_length": 265.9375,
      "completions/mean_terminated_length": 265.9375,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "entropy": 0.3077727183699608,
      "epoch": 0.138073182028717,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0890994518995285,
      "kl": 0.009753985330462456,
      "learning_rate": 9.723946271421955e-07,
      "loss": -0.0513,
      "num_tokens": 81884604.0,
      "reward": 0.5567809343338013,
      "reward_std": 0.4464142918586731,
      "rewards/reward_func/mean": 0.5567809343338013,
      "rewards/reward_func/std": 0.4464143216609955,
      "step": 2981,
      "step_time": 28.96948353946209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 199.0,
      "completions/mean_terminated_length": 199.0,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.40940602868795395,
      "epoch": 0.1381194997684113,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00440339557826519,
      "kl": 0.0034140886273235083,
      "learning_rate": 9.723853635942567e-07,
      "loss": 0.0002,
      "num_tokens": 81916268.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 2982,
      "step_time": 22.532951060682535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 288.0,
      "completions/max_terminated_length": 288.0,
      "completions/mean_length": 235.4375,
      "completions/mean_terminated_length": 235.4375,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.2550181820988655,
      "epoch": 0.1381658175081056,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07482445240020752,
      "kl": 0.01346679124981165,
      "learning_rate": 9.723761000463176e-07,
      "loss": -0.0532,
      "num_tokens": 81947235.0,
      "reward": 0.6716771721839905,
      "reward_std": 0.26680293679237366,
      "rewards/reward_func/mean": 0.6716771721839905,
      "rewards/reward_func/std": 0.26680296659469604,
      "step": 2983,
      "step_time": 25.834476247429848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 167.4375,
      "completions/mean_terminated_length": 167.4375,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.1424405835568905,
      "epoch": 0.1382121352477999,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00439409539103508,
      "kl": 0.0021929975191596895,
      "learning_rate": 9.723668364983787e-07,
      "loss": 0.0001,
      "num_tokens": 81978522.0,
      "reward": 0.9622687101364136,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9622687101364136,
      "rewards/reward_func/std": 0.0,
      "step": 2984,
      "step_time": 20.259456109255552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 159.5625,
      "completions/mean_terminated_length": 159.5625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.19097694009542465,
      "epoch": 0.1382584529874942,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10251673310995102,
      "kl": 0.005822888575494289,
      "learning_rate": 9.7235757295044e-07,
      "loss": -0.0069,
      "num_tokens": 82001011.0,
      "reward": 0.34434574842453003,
      "reward_std": 0.038803718984127045,
      "rewards/reward_func/mean": 0.34434574842453003,
      "rewards/reward_func/std": 0.038803718984127045,
      "step": 2985,
      "step_time": 16.958806682378054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 161.75,
      "completions/mean_terminated_length": 161.75,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.33455196022987366,
      "epoch": 0.13830477072718853,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0051012057811021805,
      "kl": 0.0035558182280510664,
      "learning_rate": 9.723483094025012e-07,
      "loss": 0.0002,
      "num_tokens": 82022895.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2986,
      "step_time": 16.516713060438633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 147.75,
      "completions/mean_terminated_length": 147.75,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.22480645030736923,
      "epoch": 0.13835108846688282,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010497340932488441,
      "kl": 0.006787444464862347,
      "learning_rate": 9.723390458545623e-07,
      "loss": 0.0004,
      "num_tokens": 82047019.0,
      "reward": 0.930604875087738,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.930604875087738,
      "rewards/reward_func/std": 0.0,
      "step": 2987,
      "step_time": 18.981467500329018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 160.0,
      "completions/mean_terminated_length": 160.0,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.41501862555742264,
      "epoch": 0.13839740620657712,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006412906106561422,
      "kl": 0.0036485460004769266,
      "learning_rate": 9.723297823066234e-07,
      "loss": 0.0002,
      "num_tokens": 82069083.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2988,
      "step_time": 17.08159761875868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 179.375,
      "completions/mean_terminated_length": 179.375,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.1863102950155735,
      "epoch": 0.1384437239462714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12076622247695923,
      "kl": 0.003980996320024133,
      "learning_rate": 9.723205187586845e-07,
      "loss": 0.0116,
      "num_tokens": 82091249.0,
      "reward": 0.9447779655456543,
      "reward_std": 0.01762891374528408,
      "rewards/reward_func/mean": 0.9447779655456543,
      "rewards/reward_func/std": 0.017628923058509827,
      "step": 2989,
      "step_time": 18.402724485844374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 124.625,
      "completions/mean_terminated_length": 124.625,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.22548434510827065,
      "epoch": 0.13849004168596574,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00368937267921865,
      "kl": 0.0017834273457992822,
      "learning_rate": 9.723112552107457e-07,
      "loss": 0.0001,
      "num_tokens": 82110651.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2990,
      "step_time": 12.86932748556137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 386.0,
      "completions/max_terminated_length": 386.0,
      "completions/mean_length": 217.3125,
      "completions/mean_terminated_length": 217.3125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.4416034296154976,
      "epoch": 0.13853635942566003,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11278366297483444,
      "kl": 0.006657724385149777,
      "learning_rate": 9.723019916628068e-07,
      "loss": -0.1936,
      "num_tokens": 82150832.0,
      "reward": 0.05823352932929993,
      "reward_std": 0.2329341173171997,
      "rewards/reward_func/mean": 0.05823352932929993,
      "rewards/reward_func/std": 0.2329341322183609,
      "step": 2991,
      "step_time": 34.99669820070267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 160.8125,
      "completions/mean_terminated_length": 160.8125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.18767217174172401,
      "epoch": 0.13858267716535433,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13702774047851562,
      "kl": 0.005850961548276246,
      "learning_rate": 9.72292728114868e-07,
      "loss": 0.0096,
      "num_tokens": 82172589.0,
      "reward": 0.5520051717758179,
      "reward_std": 0.014887908473610878,
      "rewards/reward_func/mean": 0.5520051717758179,
      "rewards/reward_func/std": 0.014887906610965729,
      "step": 2992,
      "step_time": 16.737485133111477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 131.1875,
      "completions/mean_terminated_length": 131.1875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.24596243724226952,
      "epoch": 0.13862899490504862,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016323782037943602,
      "kl": 0.0014825518592260778,
      "learning_rate": 9.72283464566929e-07,
      "loss": 0.0001,
      "num_tokens": 82192352.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2993,
      "step_time": 15.041008338332176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 134.125,
      "completions/mean_terminated_length": 134.125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.20328615233302116,
      "epoch": 0.13867531264474295,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00559780839830637,
      "kl": 0.00186509671038948,
      "learning_rate": 9.722742010189904e-07,
      "loss": 0.0001,
      "num_tokens": 82216498.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2994,
      "step_time": 15.585237976163626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 195.75,
      "completions/mean_terminated_length": 195.75,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.22220513597130775,
      "epoch": 0.13872163038443724,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1539018750190735,
      "kl": 0.005877788527868688,
      "learning_rate": 9.722649374710513e-07,
      "loss": 0.0223,
      "num_tokens": 82248190.0,
      "reward": 0.872715950012207,
      "reward_std": 0.23345592617988586,
      "rewards/reward_func/mean": 0.872715950012207,
      "rewards/reward_func/std": 0.23345592617988586,
      "step": 2995,
      "step_time": 23.43818784877658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 121.75,
      "completions/mean_terminated_length": 121.75,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2709006443619728,
      "epoch": 0.13876794812413154,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003966325893998146,
      "kl": 0.0022318846022244543,
      "learning_rate": 9.722556739231124e-07,
      "loss": 0.0001,
      "num_tokens": 82269994.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2996,
      "step_time": 13.280930988490582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 315.0,
      "completions/max_terminated_length": 315.0,
      "completions/mean_length": 224.4375,
      "completions/mean_terminated_length": 224.4375,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.21429625153541565,
      "epoch": 0.13881426586382584,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14449664950370789,
      "kl": 0.021888707764446735,
      "learning_rate": 9.722464103751735e-07,
      "loss": -0.086,
      "num_tokens": 82300545.0,
      "reward": 0.8610855340957642,
      "reward_std": 0.18172870576381683,
      "rewards/reward_func/mean": 0.8610855340957642,
      "rewards/reward_func/std": 0.18172870576381683,
      "step": 2997,
      "step_time": 27.55720454081893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 170.0625,
      "completions/mean_terminated_length": 170.0625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.17517578601837158,
      "epoch": 0.13886058360352016,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09581270813941956,
      "kl": 0.0014729919785168022,
      "learning_rate": 9.722371468272349e-07,
      "loss": 0.0187,
      "num_tokens": 82322834.0,
      "reward": 0.941566526889801,
      "reward_std": 0.03496573492884636,
      "rewards/reward_func/mean": 0.941566526889801,
      "rewards/reward_func/std": 0.03496573492884636,
      "step": 2998,
      "step_time": 18.05263601243496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 138.25,
      "completions/mean_terminated_length": 138.25,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3106953948736191,
      "epoch": 0.13890690134321446,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001961939036846161,
      "kl": 0.0017858156061265618,
      "learning_rate": 9.72227883279296e-07,
      "loss": 0.0001,
      "num_tokens": 82344886.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 2999,
      "step_time": 15.224582199007273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 168.3125,
      "completions/mean_terminated_length": 168.3125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.40341317653656006,
      "epoch": 0.13895321908290875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005552309099584818,
      "kl": 0.004683843930251896,
      "learning_rate": 9.722186197313571e-07,
      "loss": 0.0002,
      "num_tokens": 82380395.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3000,
      "step_time": 21.262337151914835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 135.25,
      "completions/mean_terminated_length": 135.25,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3231126293540001,
      "epoch": 0.13899953682260305,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022516052704304457,
      "kl": 0.0018037364934571087,
      "learning_rate": 9.722093561834182e-07,
      "loss": 0.0001,
      "num_tokens": 82406431.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3001,
      "step_time": 15.317341301590204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 334.0,
      "completions/max_terminated_length": 334.0,
      "completions/mean_length": 312.4375,
      "completions/mean_terminated_length": 312.4375,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "entropy": 0.16434377059340477,
      "epoch": 0.13904585456229737,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00212790141813457,
      "kl": 0.0020398667838890105,
      "learning_rate": 9.722000926354794e-07,
      "loss": 0.0001,
      "num_tokens": 82435622.0,
      "reward": 0.9813933372497559,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9813933372497559,
      "rewards/reward_func/std": 0.0,
      "step": 3002,
      "step_time": 28.26568039879203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 170.5,
      "completions/mean_terminated_length": 170.5,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3353980928659439,
      "epoch": 0.13909217230199167,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005667801480740309,
      "kl": 0.0035694522666744888,
      "learning_rate": 9.721908290875405e-07,
      "loss": 0.0002,
      "num_tokens": 82458446.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3003,
      "step_time": 17.9529795832932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 219.375,
      "completions/mean_terminated_length": 219.375,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.4704963266849518,
      "epoch": 0.13913849004168596,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027271786238998175,
      "kl": 0.0027305830735713243,
      "learning_rate": 9.721815655396016e-07,
      "loss": 0.0001,
      "num_tokens": 82485172.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3004,
      "step_time": 23.76654951274395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 212.25,
      "completions/mean_terminated_length": 212.25,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.24829314649105072,
      "epoch": 0.13918480778138026,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07245910912752151,
      "kl": 0.008992287330329418,
      "learning_rate": 9.721723019916627e-07,
      "loss": -0.0269,
      "num_tokens": 82509176.0,
      "reward": 0.5142320990562439,
      "reward_std": 0.07422799617052078,
      "rewards/reward_func/mean": 0.5142320990562439,
      "rewards/reward_func/std": 0.07422800362110138,
      "step": 3005,
      "step_time": 20.921358436346054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 166.9375,
      "completions/mean_terminated_length": 166.9375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3856392726302147,
      "epoch": 0.13923112552107458,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004602343309670687,
      "kl": 0.002773736574454233,
      "learning_rate": 9.721630384437239e-07,
      "loss": 0.0001,
      "num_tokens": 82541223.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3006,
      "step_time": 18.90654380246997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 177.375,
      "completions/mean_terminated_length": 177.375,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.3541441410779953,
      "epoch": 0.13927744326076888,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007559177000075579,
      "kl": 0.005332320695742965,
      "learning_rate": 9.72153774895785e-07,
      "loss": 0.0003,
      "num_tokens": 82563021.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3007,
      "step_time": 18.53321072459221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 158.125,
      "completions/mean_terminated_length": 158.125,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2771758511662483,
      "epoch": 0.13932376100046318,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003859475487843156,
      "kl": 0.0026886623236350715,
      "learning_rate": 9.721445113478461e-07,
      "loss": 0.0001,
      "num_tokens": 82583103.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3008,
      "step_time": 16.836896754801273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 176.6875,
      "completions/mean_terminated_length": 176.6875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.1914764828979969,
      "epoch": 0.13937007874015747,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002919860417023301,
      "kl": 0.0019065296510234475,
      "learning_rate": 9.721352477999072e-07,
      "loss": 0.0001,
      "num_tokens": 82615338.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 3009,
      "step_time": 21.103497747331858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 124.0,
      "completions/max_terminated_length": 124.0,
      "completions/mean_length": 112.625,
      "completions/mean_terminated_length": 112.625,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.26232175529003143,
      "epoch": 0.1394163964798518,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018449926283210516,
      "kl": 0.0015014406526461244,
      "learning_rate": 9.721259842519684e-07,
      "loss": 0.0001,
      "num_tokens": 82635012.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3010,
      "step_time": 12.229396902024746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 122.5625,
      "completions/mean_terminated_length": 122.5625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3080074191093445,
      "epoch": 0.1394627142195461,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005197662394493818,
      "kl": 0.002390563109656796,
      "learning_rate": 9.721167207040297e-07,
      "loss": 0.0001,
      "num_tokens": 82657101.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3011,
      "step_time": 13.777455564588308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 167.75,
      "completions/mean_terminated_length": 167.75,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.35617558658123016,
      "epoch": 0.1395090319592404,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011955607682466507,
      "kl": 0.008283481933176517,
      "learning_rate": 9.721074571560908e-07,
      "loss": 0.0004,
      "num_tokens": 82677737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3012,
      "step_time": 17.563011325895786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 148.875,
      "completions/mean_terminated_length": 148.875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2488524466753006,
      "epoch": 0.13955534969893468,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027501913718879223,
      "kl": 0.0018833787471521646,
      "learning_rate": 9.72098193608152e-07,
      "loss": 0.0001,
      "num_tokens": 82698519.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3013,
      "step_time": 15.960894122719765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 178.4375,
      "completions/mean_terminated_length": 178.4375,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.25607532262802124,
      "epoch": 0.139601667438629,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07284007221460342,
      "kl": 0.0030666259699501097,
      "learning_rate": 9.720889300602129e-07,
      "loss": -0.0101,
      "num_tokens": 82723598.0,
      "reward": 0.14382196962833405,
      "reward_std": 0.002884342335164547,
      "rewards/reward_func/mean": 0.14382196962833405,
      "rewards/reward_func/std": 0.0028843434993177652,
      "step": 3014,
      "step_time": 18.82703233882785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 170.9375,
      "completions/mean_terminated_length": 170.9375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3732554018497467,
      "epoch": 0.1396479851783233,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001959107117727399,
      "kl": 0.0018909156206063926,
      "learning_rate": 9.720796665122742e-07,
      "loss": 0.0001,
      "num_tokens": 82759949.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3015,
      "step_time": 20.609770573675632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 126.0,
      "completions/mean_terminated_length": 126.0,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2312699593603611,
      "epoch": 0.1396943029180176,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015609496040269732,
      "kl": 0.0013426737277768552,
      "learning_rate": 9.720704029643353e-07,
      "loss": 0.0001,
      "num_tokens": 82781757.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3016,
      "step_time": 14.741418339312077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 148.5625,
      "completions/mean_terminated_length": 148.5625,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.29120461642742157,
      "epoch": 0.1397406206577119,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008427511900663376,
      "kl": 0.0023643068270757794,
      "learning_rate": 9.720611394163965e-07,
      "loss": 0.0001,
      "num_tokens": 82803526.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3017,
      "step_time": 15.820613522082567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 191.3125,
      "completions/mean_terminated_length": 191.3125,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.30200475454330444,
      "epoch": 0.13978693839740622,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004149302840232849,
      "kl": 0.003819424891844392,
      "learning_rate": 9.720518758684576e-07,
      "loss": 0.0002,
      "num_tokens": 82830923.0,
      "reward": 0.2167416214942932,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.2167416214942932,
      "rewards/reward_func/std": 0.0,
      "step": 3018,
      "step_time": 20.017596885561943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 145.0,
      "completions/mean_terminated_length": 145.0,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.3494086414575577,
      "epoch": 0.13983325613710051,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022782550659030676,
      "kl": 0.001864958874648437,
      "learning_rate": 9.720426123205187e-07,
      "loss": 0.0001,
      "num_tokens": 82857579.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3019,
      "step_time": 16.663807447999716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 191.8125,
      "completions/mean_terminated_length": 191.8125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.35616112500429153,
      "epoch": 0.1398795738767948,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08675806224346161,
      "kl": 0.0035336354630999267,
      "learning_rate": 9.720333487725798e-07,
      "loss": -0.0007,
      "num_tokens": 82887640.0,
      "reward": 0.8100361824035645,
      "reward_std": 0.31683072447776794,
      "rewards/reward_func/mean": 0.8100361824035645,
      "rewards/reward_func/std": 0.31683069467544556,
      "step": 3020,
      "step_time": 23.310782480984926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 175.75,
      "completions/mean_terminated_length": 175.75,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.4156198650598526,
      "epoch": 0.1399258916164891,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019909366965293884,
      "kl": 0.002192864805692807,
      "learning_rate": 9.72024085224641e-07,
      "loss": 0.0001,
      "num_tokens": 82942036.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3021,
      "step_time": 26.098113026469946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 116.875,
      "completions/mean_terminated_length": 116.875,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2946057990193367,
      "epoch": 0.13997220935618343,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006412202958017588,
      "kl": 0.0036119677824899554,
      "learning_rate": 9.72014821676702e-07,
      "loss": 0.0002,
      "num_tokens": 82963410.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3022,
      "step_time": 13.971223030239344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 153.75,
      "completions/mean_terminated_length": 153.75,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2157832831144333,
      "epoch": 0.14001852709587773,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002061971463263035,
      "kl": 0.0014133761869743466,
      "learning_rate": 9.720055581287632e-07,
      "loss": 0.0001,
      "num_tokens": 82984206.0,
      "reward": 0.26742759346961975,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.26742759346961975,
      "rewards/reward_func/std": 0.0,
      "step": 3023,
      "step_time": 15.735854860395193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 197.1875,
      "completions/mean_terminated_length": 197.1875,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.4383648708462715,
      "epoch": 0.14006484483557202,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005425654351711273,
      "kl": 0.003583900397643447,
      "learning_rate": 9.719962945808243e-07,
      "loss": 0.0002,
      "num_tokens": 83018753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3024,
      "step_time": 23.91798797994852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 168.25,
      "completions/mean_terminated_length": 168.25,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.39416050910949707,
      "epoch": 0.14011116257526632,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002842653775587678,
      "kl": 0.0023979249817784876,
      "learning_rate": 9.719870310328857e-07,
      "loss": 0.0001,
      "num_tokens": 83052597.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3025,
      "step_time": 19.870104629546404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 283.0,
      "completions/max_terminated_length": 283.0,
      "completions/mean_length": 220.6875,
      "completions/mean_terminated_length": 220.6875,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.21130745857954025,
      "epoch": 0.14015748031496064,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11111432313919067,
      "kl": 0.0033099165884777904,
      "learning_rate": 9.719777674849466e-07,
      "loss": 0.0688,
      "num_tokens": 83075936.0,
      "reward": 0.9771252870559692,
      "reward_std": 0.08604231476783752,
      "rewards/reward_func/mean": 0.9771252870559692,
      "rewards/reward_func/std": 0.08604232966899872,
      "step": 3026,
      "step_time": 24.006690483540297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 146.0,
      "completions/mean_terminated_length": 146.0,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.15849712491035461,
      "epoch": 0.14020379805465494,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.053610339760780334,
      "kl": 0.0011202768073417246,
      "learning_rate": 9.719685039370077e-07,
      "loss": 0.0001,
      "num_tokens": 83112480.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3027,
      "step_time": 18.192413430660963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 184.875,
      "completions/mean_terminated_length": 184.875,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.44311244040727615,
      "epoch": 0.14025011579434923,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037516490556299686,
      "kl": 0.00294655334437266,
      "learning_rate": 9.71959240389069e-07,
      "loss": 0.0001,
      "num_tokens": 83136014.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3028,
      "step_time": 20.209473069757223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 365.0,
      "completions/max_terminated_length": 365.0,
      "completions/mean_length": 261.375,
      "completions/mean_terminated_length": 261.375,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "entropy": 0.5869688093662262,
      "epoch": 0.14029643353404353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10678769648075104,
      "kl": 0.003234754258301109,
      "learning_rate": 9.719499768411302e-07,
      "loss": 0.0309,
      "num_tokens": 83166564.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 3029,
      "step_time": 30.887698356062174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 131.8125,
      "completions/mean_terminated_length": 131.8125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3739954084157944,
      "epoch": 0.14034275127373785,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022352435626089573,
      "kl": 0.0020131257479079068,
      "learning_rate": 9.719407132931913e-07,
      "loss": 0.0001,
      "num_tokens": 83193073.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3030,
      "step_time": 15.910360716283321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 119.875,
      "completions/mean_terminated_length": 119.875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.27550504356622696,
      "epoch": 0.14038906901343215,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017742261989042163,
      "kl": 0.0014804693055339158,
      "learning_rate": 9.719314497452524e-07,
      "loss": 0.0001,
      "num_tokens": 83213823.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3031,
      "step_time": 13.649042718112469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 148.4375,
      "completions/mean_terminated_length": 148.4375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.4004627615213394,
      "epoch": 0.14043538675312645,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002752473810687661,
      "kl": 0.002842024026904255,
      "learning_rate": 9.719221861973135e-07,
      "loss": 0.0001,
      "num_tokens": 83266678.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3032,
      "step_time": 23.976348515599966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 180.75,
      "completions/mean_terminated_length": 180.75,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2226003035902977,
      "epoch": 0.14048170449282074,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23777858912944794,
      "kl": 0.022119770292192698,
      "learning_rate": 9.719129226493747e-07,
      "loss": 0.0544,
      "num_tokens": 83289586.0,
      "reward": 0.8626243472099304,
      "reward_std": 0.2472572922706604,
      "rewards/reward_func/mean": 0.8626243472099304,
      "rewards/reward_func/std": 0.2472572922706604,
      "step": 3033,
      "step_time": 20.435957849025726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 141.5,
      "completions/mean_terminated_length": 141.5,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.30680377781391144,
      "epoch": 0.14052802223251507,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008545816875994205,
      "kl": 0.0029445297259371728,
      "learning_rate": 9.719036591014358e-07,
      "loss": 0.0001,
      "num_tokens": 83318282.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3034,
      "step_time": 16.973541107028723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 164.9375,
      "completions/mean_terminated_length": 164.9375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3295881971716881,
      "epoch": 0.14057433997220936,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.017239896580576897,
      "kl": 0.010237524285912514,
      "learning_rate": 9.71894395553497e-07,
      "loss": 0.0005,
      "num_tokens": 83340489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3035,
      "step_time": 17.23481447249651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 178.5625,
      "completions/mean_terminated_length": 178.5625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.35540928691625595,
      "epoch": 0.14062065771190366,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006867921911180019,
      "kl": 0.004615910118445754,
      "learning_rate": 9.71885132005558e-07,
      "loss": 0.0002,
      "num_tokens": 83372994.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3036,
      "step_time": 21.024396255612373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 183.625,
      "completions/mean_terminated_length": 183.625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.172479510307312,
      "epoch": 0.14066697545159795,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004548189230263233,
      "kl": 0.004860262502916157,
      "learning_rate": 9.718758684576192e-07,
      "loss": 0.0002,
      "num_tokens": 83398716.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3037,
      "step_time": 19.449104487895966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 197.375,
      "completions/mean_terminated_length": 197.375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.3322830870747566,
      "epoch": 0.14071329319129228,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003146246075630188,
      "kl": 0.002210581151302904,
      "learning_rate": 9.718666049096803e-07,
      "loss": 0.0001,
      "num_tokens": 83426546.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 3038,
      "step_time": 21.201699301600456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 137.875,
      "completions/mean_terminated_length": 137.875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.25694333761930466,
      "epoch": 0.14075961093098657,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008664743974804878,
      "kl": 0.0029743796912953258,
      "learning_rate": 9.718573413617414e-07,
      "loss": 0.0001,
      "num_tokens": 83446128.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3039,
      "step_time": 15.962315011769533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 186.4375,
      "completions/mean_terminated_length": 186.4375,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.38639630377292633,
      "epoch": 0.14080592867068087,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14186641573905945,
      "kl": 0.005497544887475669,
      "learning_rate": 9.718480778138025e-07,
      "loss": 0.0188,
      "num_tokens": 83482887.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 3040,
      "step_time": 22.51087235286832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 120.8125,
      "completions/mean_terminated_length": 120.8125,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.22201518341898918,
      "epoch": 0.14085224641037516,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010322537273168564,
      "kl": 0.0010378068109275773,
      "learning_rate": 9.718388142658639e-07,
      "loss": 0.0001,
      "num_tokens": 83507380.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3041,
      "step_time": 14.335064977407455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 165.0625,
      "completions/mean_terminated_length": 165.0625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.20901744812726974,
      "epoch": 0.1408985641500695,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010990627342835069,
      "kl": 0.0010493967711227015,
      "learning_rate": 9.71829550717925e-07,
      "loss": 0.0001,
      "num_tokens": 83553189.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3042,
      "step_time": 23.672550708055496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 195.375,
      "completions/mean_terminated_length": 195.375,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.26086245849728584,
      "epoch": 0.14094488188976378,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10829035937786102,
      "kl": 0.002545473660575226,
      "learning_rate": 9.718202871699861e-07,
      "loss": -0.0074,
      "num_tokens": 83589339.0,
      "reward": 0.8403646945953369,
      "reward_std": 0.14294101297855377,
      "rewards/reward_func/mean": 0.8403646945953369,
      "rewards/reward_func/std": 0.14294102787971497,
      "step": 3043,
      "step_time": 22.47973906993866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 207.875,
      "completions/mean_terminated_length": 207.875,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.24207553267478943,
      "epoch": 0.14099119962945808,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09628209471702576,
      "kl": 0.010839248541742563,
      "learning_rate": 9.718110236220473e-07,
      "loss": -0.0258,
      "num_tokens": 83618537.0,
      "reward": 0.8728713393211365,
      "reward_std": 0.23276567459106445,
      "rewards/reward_func/mean": 0.8728713393211365,
      "rewards/reward_func/std": 0.23276568949222565,
      "step": 3044,
      "step_time": 22.380034614354372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 131.0,
      "completions/mean_length": 115.625,
      "completions/mean_terminated_length": 115.625,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.24875660985708237,
      "epoch": 0.14103751736915238,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031943044159561396,
      "kl": 0.0018857441318687052,
      "learning_rate": 9.718017600741084e-07,
      "loss": 0.0001,
      "num_tokens": 83638483.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3045,
      "step_time": 13.065401766449213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 255.0,
      "completions/max_terminated_length": 255.0,
      "completions/mean_length": 193.5,
      "completions/mean_terminated_length": 193.5,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.32689883559942245,
      "epoch": 0.1410838351088467,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11655188351869583,
      "kl": 0.008323002490215003,
      "learning_rate": 9.717924965261695e-07,
      "loss": -0.0041,
      "num_tokens": 83661307.0,
      "reward": 0.829888105392456,
      "reward_std": 0.2213035225868225,
      "rewards/reward_func/mean": 0.829888105392456,
      "rewards/reward_func/std": 0.22130350768566132,
      "step": 3046,
      "step_time": 21.887813713401556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 201.8125,
      "completions/mean_terminated_length": 201.8125,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.4084246978163719,
      "epoch": 0.141130152848541,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004622376523911953,
      "kl": 0.003465540474280715,
      "learning_rate": 9.717832329782306e-07,
      "loss": 0.0002,
      "num_tokens": 83689064.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3047,
      "step_time": 22.28806211799383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 195.625,
      "completions/mean_terminated_length": 195.625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.191276665776968,
      "epoch": 0.1411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10191956162452698,
      "kl": 0.0030786641291342676,
      "learning_rate": 9.717739694302918e-07,
      "loss": -0.0201,
      "num_tokens": 83723202.0,
      "reward": 0.16149914264678955,
      "reward_std": 0.12526051700115204,
      "rewards/reward_func/mean": 0.16149914264678955,
      "rewards/reward_func/std": 0.12526051700115204,
      "step": 3048,
      "step_time": 23.00204337015748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 125.0625,
      "completions/mean_terminated_length": 125.0625,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.28970544785261154,
      "epoch": 0.1412227883279296,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031022813636809587,
      "kl": 0.0023523358104284853,
      "learning_rate": 9.717647058823529e-07,
      "loss": 0.0001,
      "num_tokens": 83742963.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3049,
      "step_time": 14.628591068089008
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 129.1875,
      "completions/mean_terminated_length": 129.1875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.23548217117786407,
      "epoch": 0.1412691060676239,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013360042357817292,
      "kl": 0.0010569237201707438,
      "learning_rate": 9.71755442334414e-07,
      "loss": 0.0001,
      "num_tokens": 83770214.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3050,
      "step_time": 15.252948425710201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 161.4375,
      "completions/mean_terminated_length": 161.4375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.44899706542491913,
      "epoch": 0.1413154238073182,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018619222100824118,
      "kl": 0.0023352773278020322,
      "learning_rate": 9.717461787864751e-07,
      "loss": 0.0001,
      "num_tokens": 83825725.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3051,
      "step_time": 25.797151293605566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 141.0,
      "completions/mean_terminated_length": 141.0,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.3013594150543213,
      "epoch": 0.1413617415470125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021030872594565153,
      "kl": 0.001682311820331961,
      "learning_rate": 9.717369152385363e-07,
      "loss": 0.0001,
      "num_tokens": 83847101.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3052,
      "step_time": 16.435410499572754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 125.5625,
      "completions/mean_terminated_length": 125.5625,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2900470197200775,
      "epoch": 0.1414080592867068,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014194217510521412,
      "kl": 0.0037966840609442443,
      "learning_rate": 9.717276516905974e-07,
      "loss": 0.0002,
      "num_tokens": 83867606.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3053,
      "step_time": 14.13415927067399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 335.0,
      "completions/max_terminated_length": 335.0,
      "completions/mean_length": 295.5,
      "completions/mean_terminated_length": 295.5,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "entropy": 0.222489595413208,
      "epoch": 0.14145437702640112,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033659781329333782,
      "kl": 0.002463035110849887,
      "learning_rate": 9.717183881426585e-07,
      "loss": 0.0001,
      "num_tokens": 83908078.0,
      "reward": 0.8346666097640991,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8346666097640991,
      "rewards/reward_func/std": 0.0,
      "step": 3054,
      "step_time": 31.249343916773796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 191.75,
      "completions/mean_terminated_length": 191.75,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.21848269924521446,
      "epoch": 0.14150069476609542,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11221415549516678,
      "kl": 0.0034620476653799415,
      "learning_rate": 9.717091245947198e-07,
      "loss": -0.0784,
      "num_tokens": 83943114.0,
      "reward": 0.34633660316467285,
      "reward_std": 0.014082971960306168,
      "rewards/reward_func/mean": 0.34633660316467285,
      "rewards/reward_func/std": 0.014082977548241615,
      "step": 3055,
      "step_time": 23.131091088056564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 136.0,
      "completions/mean_terminated_length": 136.0,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.2509133219718933,
      "epoch": 0.14154701250578972,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017673317342996597,
      "kl": 0.0015014345990493894,
      "learning_rate": 9.71699861046781e-07,
      "loss": 0.0001,
      "num_tokens": 83964682.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3056,
      "step_time": 14.708979427814484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 181.9375,
      "completions/mean_terminated_length": 181.9375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.22735528647899628,
      "epoch": 0.141593330245484,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13249163329601288,
      "kl": 0.003153125464450568,
      "learning_rate": 9.716905974988419e-07,
      "loss": 0.0244,
      "num_tokens": 84017945.0,
      "reward": 0.9714365601539612,
      "reward_std": 0.02950017899274826,
      "rewards/reward_func/mean": 0.9714365601539612,
      "rewards/reward_func/std": 0.029500195756554604,
      "step": 3057,
      "step_time": 28.813837237656116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 143.0,
      "completions/mean_terminated_length": 143.0,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.2536050006747246,
      "epoch": 0.14163964798517834,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004969343543052673,
      "kl": 0.003967517986893654,
      "learning_rate": 9.716813339509032e-07,
      "loss": 0.0002,
      "num_tokens": 84038057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3058,
      "step_time": 14.99054791033268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 172.5,
      "completions/mean_terminated_length": 172.5,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.4103945419192314,
      "epoch": 0.14168596572487263,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022730007767677307,
      "kl": 0.002340605657082051,
      "learning_rate": 9.716720704029643e-07,
      "loss": 0.0001,
      "num_tokens": 84070049.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3059,
      "step_time": 20.31228854879737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 201.0625,
      "completions/mean_terminated_length": 201.0625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.2669229544699192,
      "epoch": 0.14173228346456693,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00418067304417491,
      "kl": 0.003562486555892974,
      "learning_rate": 9.716628068550255e-07,
      "loss": 0.0002,
      "num_tokens": 84093106.0,
      "reward": 0.9672160744667053,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9672160744667053,
      "rewards/reward_func/std": 0.0,
      "step": 3060,
      "step_time": 22.047264583408833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 155.1875,
      "completions/mean_terminated_length": 155.1875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.36747582256793976,
      "epoch": 0.14177860120426122,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002079027472063899,
      "kl": 0.002176420297473669,
      "learning_rate": 9.716535433070866e-07,
      "loss": 0.0001,
      "num_tokens": 84152261.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3061,
      "step_time": 26.792435012757778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 304.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 249.125,
      "completions/mean_terminated_length": 249.125,
      "completions/min_length": 203.0,
      "completions/min_terminated_length": 203.0,
      "entropy": 0.2686142325401306,
      "epoch": 0.14182491894395555,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10157947987318039,
      "kl": 0.0017788418917916715,
      "learning_rate": 9.716442797591477e-07,
      "loss": -0.0459,
      "num_tokens": 84191575.0,
      "reward": 0.12028113752603531,
      "reward_std": 0.07132308930158615,
      "rewards/reward_func/mean": 0.12028113752603531,
      "rewards/reward_func/std": 0.07132309675216675,
      "step": 3062,
      "step_time": 29.01181998103857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.22781230881810188,
      "epoch": 0.14187123668364984,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10221483558416367,
      "kl": 0.0018892473890446126,
      "learning_rate": 9.716350162112088e-07,
      "loss": 0.0533,
      "num_tokens": 84217301.0,
      "reward": 0.699444055557251,
      "reward_std": 0.07328739017248154,
      "rewards/reward_func/mean": 0.699444055557251,
      "rewards/reward_func/std": 0.07328739762306213,
      "step": 3063,
      "step_time": 19.34302917867899
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 173.125,
      "completions/mean_terminated_length": 173.125,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.42125069350004196,
      "epoch": 0.14191755442334414,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018473091768100858,
      "kl": 0.001721715903840959,
      "learning_rate": 9.7162575266327e-07,
      "loss": 0.0001,
      "num_tokens": 84250935.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3064,
      "step_time": 20.535016171634197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 198.0625,
      "completions/mean_terminated_length": 198.0625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3379729837179184,
      "epoch": 0.14196387216303843,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10586327314376831,
      "kl": 0.0037930503604002297,
      "learning_rate": 9.71616489115331e-07,
      "loss": -0.0289,
      "num_tokens": 84288568.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 3065,
      "step_time": 24.85549681261182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 151.125,
      "completions/mean_terminated_length": 151.125,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.4226868078112602,
      "epoch": 0.14201018990273276,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013798902509734035,
      "kl": 0.00155748357065022,
      "learning_rate": 9.716072255673922e-07,
      "loss": 0.0001,
      "num_tokens": 84323210.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3066,
      "step_time": 19.625557269901037
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 140.9375,
      "completions/mean_terminated_length": 140.9375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.34594182670116425,
      "epoch": 0.14205650764242705,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011119349859654903,
      "kl": 0.00644524663221091,
      "learning_rate": 9.715979620194533e-07,
      "loss": 0.0003,
      "num_tokens": 84343801.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3067,
      "step_time": 14.647478591650724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 155.125,
      "completions/mean_terminated_length": 155.125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.39974554628133774,
      "epoch": 0.14210282538212135,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002050853567197919,
      "kl": 0.0018190699338447303,
      "learning_rate": 9.715886984715147e-07,
      "loss": 0.0001,
      "num_tokens": 84377019.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3068,
      "step_time": 20.185601744800806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 132.6875,
      "completions/mean_terminated_length": 132.6875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.27893398702144623,
      "epoch": 0.14214914312181565,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19752110540866852,
      "kl": 0.005182704247999936,
      "learning_rate": 9.715794349235756e-07,
      "loss": -0.1375,
      "num_tokens": 84397590.0,
      "reward": 0.1875,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.1875,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 3069,
      "step_time": 16.552004102617502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 167.1875,
      "completions/mean_terminated_length": 167.1875,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.14356551691889763,
      "epoch": 0.14219546086150997,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001284717465750873,
      "kl": 0.000879270868608728,
      "learning_rate": 9.715701713756367e-07,
      "loss": 0.0,
      "num_tokens": 84426473.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 3070,
      "step_time": 18.244757778942585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 157.5625,
      "completions/mean_terminated_length": 157.5625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.4129192754626274,
      "epoch": 0.14224177860120427,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00182828470133245,
      "kl": 0.002239855588413775,
      "learning_rate": 9.71560907827698e-07,
      "loss": 0.0001,
      "num_tokens": 84477298.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3071,
      "step_time": 23.213310711085796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 307.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 246.0,
      "completions/mean_terminated_length": 246.0,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "entropy": 0.4551624208688736,
      "epoch": 0.14228809634089856,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07563548535108566,
      "kl": 0.002163757977541536,
      "learning_rate": 9.715516442797592e-07,
      "loss": 0.0621,
      "num_tokens": 84506498.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 3072,
      "step_time": 26.718425411731005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 167.0,
      "completions/mean_terminated_length": 167.0,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.27576783671975136,
      "epoch": 0.14233441408059286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09983588755130768,
      "kl": 0.001499654186773114,
      "learning_rate": 9.715423807318203e-07,
      "loss": 0.0869,
      "num_tokens": 84528866.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 3073,
      "step_time": 21.365447714924812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 173.3125,
      "completions/mean_terminated_length": 173.3125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.1853836365044117,
      "epoch": 0.14238073182028718,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004038714803755283,
      "kl": 0.0018410694028716534,
      "learning_rate": 9.715331171838814e-07,
      "loss": 0.0001,
      "num_tokens": 84567847.0,
      "reward": 0.8507331609725952,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8507331609725952,
      "rewards/reward_func/std": 0.0,
      "step": 3074,
      "step_time": 22.511045575141907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 129.75,
      "completions/mean_terminated_length": 129.75,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.35022301971912384,
      "epoch": 0.14242704955998148,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002216340508311987,
      "kl": 0.0017985946033149958,
      "learning_rate": 9.715238536359426e-07,
      "loss": 0.0001,
      "num_tokens": 84592435.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3075,
      "step_time": 14.913249608129263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.2580137923359871,
      "epoch": 0.14247336729967577,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11763420701026917,
      "kl": 0.004678957397118211,
      "learning_rate": 9.715145900880037e-07,
      "loss": -0.0676,
      "num_tokens": 84612897.0,
      "reward": 0.7995544672012329,
      "reward_std": 0.051335595548152924,
      "rewards/reward_func/mean": 0.7995544672012329,
      "rewards/reward_func/std": 0.05133558437228203,
      "step": 3076,
      "step_time": 19.429770436137915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 135.0,
      "completions/mean_terminated_length": 135.0,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2920405864715576,
      "epoch": 0.14251968503937007,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034133740700781345,
      "kl": 0.0020403833768796176,
      "learning_rate": 9.715053265400648e-07,
      "loss": 0.0001,
      "num_tokens": 84649185.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3077,
      "step_time": 18.52658887952566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 125.5625,
      "completions/mean_terminated_length": 125.5625,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.28933966904878616,
      "epoch": 0.1425660027790644,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011805701069533825,
      "kl": 0.0012496154668042436,
      "learning_rate": 9.71496062992126e-07,
      "loss": 0.0001,
      "num_tokens": 84683962.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3078,
      "step_time": 17.53727101162076
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 200.6875,
      "completions/mean_terminated_length": 200.6875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.46959687769412994,
      "epoch": 0.1426123205187587,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004442025441676378,
      "kl": 0.0035442839143797755,
      "learning_rate": 9.71486799444187e-07,
      "loss": 0.0002,
      "num_tokens": 84708981.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3079,
      "step_time": 21.380993772298098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 143.875,
      "completions/mean_terminated_length": 143.875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3721555396914482,
      "epoch": 0.14265863825845299,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020447385031729937,
      "kl": 0.001785428379662335,
      "learning_rate": 9.714775358962482e-07,
      "loss": 0.0001,
      "num_tokens": 84733059.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3080,
      "step_time": 16.2989693954587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 153.3125,
      "completions/mean_terminated_length": 153.3125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.1326531060039997,
      "epoch": 0.14270495599814728,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12060035765171051,
      "kl": 0.0008723430219106376,
      "learning_rate": 9.714682723483093e-07,
      "loss": -0.0011,
      "num_tokens": 84767000.0,
      "reward": 0.9655313491821289,
      "reward_std": 0.03559904173016548,
      "rewards/reward_func/mean": 0.9655313491821289,
      "rewards/reward_func/std": 0.03559904173016548,
      "step": 3081,
      "step_time": 18.66436466574669
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 151.8125,
      "completions/mean_terminated_length": 151.8125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.18006207048892975,
      "epoch": 0.1427512737378416,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012774410424754024,
      "kl": 0.0011981050483882427,
      "learning_rate": 9.714590088003704e-07,
      "loss": 0.0001,
      "num_tokens": 84787637.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 3082,
      "step_time": 15.84832838922739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 179.5625,
      "completions/mean_terminated_length": 179.5625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.37587346881628036,
      "epoch": 0.1427975914775359,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16464351117610931,
      "kl": 0.011520389234647155,
      "learning_rate": 9.714497452524315e-07,
      "loss": -0.0793,
      "num_tokens": 84809838.0,
      "reward": 0.3690481185913086,
      "reward_std": 0.49216657876968384,
      "rewards/reward_func/mean": 0.3690481185913086,
      "rewards/reward_func/std": 0.49216654896736145,
      "step": 3083,
      "step_time": 19.75730662792921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 170.25,
      "completions/mean_terminated_length": 170.25,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.4154827445745468,
      "epoch": 0.1428439092172302,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00961045641452074,
      "kl": 0.007004943443462253,
      "learning_rate": 9.714404817044927e-07,
      "loss": 0.0003,
      "num_tokens": 84831442.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3084,
      "step_time": 17.125869277864695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 175.875,
      "completions/mean_terminated_length": 175.875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.2482575811445713,
      "epoch": 0.1428902269569245,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09998895227909088,
      "kl": 0.005964857351500541,
      "learning_rate": 9.71431218156554e-07,
      "loss": 0.0462,
      "num_tokens": 84852048.0,
      "reward": 0.7368773818016052,
      "reward_std": 0.3585629165172577,
      "rewards/reward_func/mean": 0.7368773818016052,
      "rewards/reward_func/std": 0.3585629463195801,
      "step": 3085,
      "step_time": 18.564539909362793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 269.0,
      "completions/max_terminated_length": 269.0,
      "completions/mean_length": 220.9375,
      "completions/mean_terminated_length": 220.9375,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.38869137316942215,
      "epoch": 0.14293654469661882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32799142599105835,
      "kl": 0.0051233284175395966,
      "learning_rate": 9.714219546086151e-07,
      "loss": 0.0081,
      "num_tokens": 84882895.0,
      "reward": 0.007592645939439535,
      "reward_std": 0.03037058375775814,
      "rewards/reward_func/mean": 0.007592645939439535,
      "rewards/reward_func/std": 0.03037058375775814,
      "step": 3086,
      "step_time": 24.329820852726698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 136.5,
      "completions/mean_terminated_length": 136.5,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.2955872640013695,
      "epoch": 0.1429828624363131,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034672317560762167,
      "kl": 0.002140417287591845,
      "learning_rate": 9.714126910606763e-07,
      "loss": 0.0001,
      "num_tokens": 84904311.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3087,
      "step_time": 15.493290606886148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 128.5,
      "completions/mean_terminated_length": 128.5,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.35423120111227036,
      "epoch": 0.1430291801760074,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031535944435745478,
      "kl": 0.0023835660540498793,
      "learning_rate": 9.714034275127374e-07,
      "loss": 0.0001,
      "num_tokens": 84927423.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3088,
      "step_time": 15.894897159188986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 173.5,
      "completions/mean_terminated_length": 173.5,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.17939773947000504,
      "epoch": 0.1430754979157017,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07509777694940567,
      "kl": 0.002948031004052609,
      "learning_rate": 9.713941639647985e-07,
      "loss": -0.0051,
      "num_tokens": 84973399.0,
      "reward": 0.9079843759536743,
      "reward_std": 0.0359191857278347,
      "rewards/reward_func/mean": 0.9079843759536743,
      "rewards/reward_func/std": 0.035919204354286194,
      "step": 3089,
      "step_time": 23.080620639026165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 278.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 242.5,
      "completions/mean_terminated_length": 242.5,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "entropy": 0.38066281378269196,
      "epoch": 0.14312181565539603,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09729088097810745,
      "kl": 0.004983256570994854,
      "learning_rate": 9.713849004168596e-07,
      "loss": -0.0478,
      "num_tokens": 85011839.0,
      "reward": 0.13014303147792816,
      "reward_std": 0.14395686984062195,
      "rewards/reward_func/mean": 0.13014303147792816,
      "rewards/reward_func/std": 0.14395686984062195,
      "step": 3090,
      "step_time": 27.270726181566715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 184.125,
      "completions/mean_terminated_length": 184.125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3863908275961876,
      "epoch": 0.14316813339509032,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006155778653919697,
      "kl": 0.0033418061793781817,
      "learning_rate": 9.713756368689208e-07,
      "loss": 0.0002,
      "num_tokens": 85048849.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3091,
      "step_time": 23.163608994334936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 207.125,
      "completions/mean_terminated_length": 207.125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.2691080644726753,
      "epoch": 0.14321445113478462,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003819690551608801,
      "kl": 0.003386061522178352,
      "learning_rate": 9.713663733209819e-07,
      "loss": 0.0002,
      "num_tokens": 85087027.0,
      "reward": 0.23965103924274445,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.23965103924274445,
      "rewards/reward_func/std": 0.0,
      "step": 3092,
      "step_time": 24.239079508930445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 197.0625,
      "completions/mean_terminated_length": 197.0625,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.3097037822008133,
      "epoch": 0.14326076887447892,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09709495306015015,
      "kl": 0.0023074370983522385,
      "learning_rate": 9.71357109773043e-07,
      "loss": -0.0284,
      "num_tokens": 85119972.0,
      "reward": 0.8820197582244873,
      "reward_std": 0.23685505986213684,
      "rewards/reward_func/mean": 0.8820197582244873,
      "rewards/reward_func/std": 0.23685505986213684,
      "step": 3093,
      "step_time": 22.47984228283167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 184.375,
      "completions/mean_terminated_length": 184.375,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.17485283315181732,
      "epoch": 0.14330708661417324,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019301557913422585,
      "kl": 0.0016929760458879173,
      "learning_rate": 9.713478462251041e-07,
      "loss": 0.0001,
      "num_tokens": 85157402.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3094,
      "step_time": 21.515938695520163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 157.8125,
      "completions/mean_terminated_length": 157.8125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.15712221339344978,
      "epoch": 0.14335340435386754,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015463732415810227,
      "kl": 0.0010701666760724038,
      "learning_rate": 9.713385826771653e-07,
      "loss": 0.0001,
      "num_tokens": 85178679.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 3095,
      "step_time": 15.812082946300507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 308.0,
      "completions/max_terminated_length": 308.0,
      "completions/mean_length": 216.8125,
      "completions/mean_terminated_length": 216.8125,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.4117165356874466,
      "epoch": 0.14339972209356183,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10295020788908005,
      "kl": 0.006531993625685573,
      "learning_rate": 9.713293191292264e-07,
      "loss": -0.052,
      "num_tokens": 85208132.0,
      "reward": 0.43998464941978455,
      "reward_std": 0.262356162071228,
      "rewards/reward_func/mean": 0.43998464941978455,
      "rewards/reward_func/std": 0.2623561918735504,
      "step": 3096,
      "step_time": 26.689372658729553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 400.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 322.1875,
      "completions/mean_terminated_length": 322.1875,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.21018989011645317,
      "epoch": 0.14344603983325613,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07340876758098602,
      "kl": 0.006146425555925816,
      "learning_rate": 9.713200555812875e-07,
      "loss": -0.1871,
      "num_tokens": 85242695.0,
      "reward": 0.6335451006889343,
      "reward_std": 0.3770582377910614,
      "rewards/reward_func/mean": 0.6335451006889343,
      "rewards/reward_func/std": 0.3770582675933838,
      "step": 3097,
      "step_time": 34.40815368667245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 145.4375,
      "completions/mean_terminated_length": 145.4375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.18927786126732826,
      "epoch": 0.14349235757295045,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002579091116786003,
      "kl": 0.0013895122101530433,
      "learning_rate": 9.713107920333488e-07,
      "loss": 0.0001,
      "num_tokens": 85267214.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 3098,
      "step_time": 16.404458358883858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 170.0,
      "completions/mean_terminated_length": 170.0,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3593042865395546,
      "epoch": 0.14353867531264475,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004546928219497204,
      "kl": 0.003188308735843748,
      "learning_rate": 9.7130152848541e-07,
      "loss": 0.0002,
      "num_tokens": 85296462.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3099,
      "step_time": 18.903316736221313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 185.0,
      "completions/mean_terminated_length": 185.0,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.35782644152641296,
      "epoch": 0.14358499305233904,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018865115707740188,
      "kl": 0.00179413627483882,
      "learning_rate": 9.712922649374709e-07,
      "loss": 0.0001,
      "num_tokens": 85336446.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3100,
      "step_time": 22.833679974079132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 158.625,
      "completions/mean_terminated_length": 158.625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.1481158770620823,
      "epoch": 0.14363131079203334,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017669608350843191,
      "kl": 0.0011966716556344181,
      "learning_rate": 9.712830013895322e-07,
      "loss": 0.0001,
      "num_tokens": 85361608.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 3101,
      "step_time": 17.195040185004473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 281.0,
      "completions/max_terminated_length": 281.0,
      "completions/mean_length": 214.6875,
      "completions/mean_terminated_length": 214.6875,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.25260768085718155,
      "epoch": 0.14367762853172766,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06848274916410446,
      "kl": 0.002867669682018459,
      "learning_rate": 9.712737378415933e-07,
      "loss": -0.006,
      "num_tokens": 85386307.0,
      "reward": 0.1186196506023407,
      "reward_std": 0.0013871828559786081,
      "rewards/reward_func/mean": 0.1186196506023407,
      "rewards/reward_func/std": 0.0013871807605028152,
      "step": 3102,
      "step_time": 24.05906977877021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 156.75,
      "completions/mean_terminated_length": 156.75,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3555886596441269,
      "epoch": 0.14372394627142196,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028709163889288902,
      "kl": 0.002633381634950638,
      "learning_rate": 9.712644742936545e-07,
      "loss": 0.0001,
      "num_tokens": 85415695.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3103,
      "step_time": 18.29762477427721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 128.0,
      "completions/max_terminated_length": 128.0,
      "completions/mean_length": 110.9375,
      "completions/mean_terminated_length": 110.9375,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.2514931708574295,
      "epoch": 0.14377026401111626,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002546251518651843,
      "kl": 0.0020225835614837706,
      "learning_rate": 9.712552107457156e-07,
      "loss": 0.0001,
      "num_tokens": 85436702.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3104,
      "step_time": 12.752374984323978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 196.6875,
      "completions/mean_terminated_length": 196.6875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3918505311012268,
      "epoch": 0.14381658175081055,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12146206200122833,
      "kl": 0.005598803400062025,
      "learning_rate": 9.712459471977767e-07,
      "loss": -0.0384,
      "num_tokens": 85460953.0,
      "reward": 0.0003962897462770343,
      "reward_std": 0.0015851589851081371,
      "rewards/reward_func/mean": 0.0003962897462770343,
      "rewards/reward_func/std": 0.0015851589851081371,
      "step": 3105,
      "step_time": 21.550162710249424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 199.6875,
      "completions/mean_terminated_length": 199.6875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.19410334154963493,
      "epoch": 0.14386289949050488,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10892891883850098,
      "kl": 0.002881001215428114,
      "learning_rate": 9.712366836498378e-07,
      "loss": 0.0104,
      "num_tokens": 85486452.0,
      "reward": 0.9705920815467834,
      "reward_std": 0.017535503953695297,
      "rewards/reward_func/mean": 0.9705920815467834,
      "rewards/reward_func/std": 0.0175354965031147,
      "step": 3106,
      "step_time": 20.669698297977448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 235.8125,
      "completions/mean_terminated_length": 235.8125,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "entropy": 0.21750912070274353,
      "epoch": 0.14390921723019917,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11343816667795181,
      "kl": 0.005503826774656773,
      "learning_rate": 9.71227420101899e-07,
      "loss": -0.0276,
      "num_tokens": 85509697.0,
      "reward": 0.9566006064414978,
      "reward_std": 0.014243211597204208,
      "rewards/reward_func/mean": 0.9566006064414978,
      "rewards/reward_func/std": 0.014243212528526783,
      "step": 3107,
      "step_time": 22.1902144998312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 148.8125,
      "completions/mean_terminated_length": 148.8125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.17570658028125763,
      "epoch": 0.14395553496989347,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018220599740743637,
      "kl": 0.001269097119802609,
      "learning_rate": 9.7121815655396e-07,
      "loss": 0.0001,
      "num_tokens": 85532702.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 3108,
      "step_time": 16.33903457224369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 181.1875,
      "completions/mean_terminated_length": 181.1875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.23641077429056168,
      "epoch": 0.14400185270958776,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002284695627167821,
      "kl": 0.0024181325279641896,
      "learning_rate": 9.712088930060212e-07,
      "loss": 0.0001,
      "num_tokens": 85557809.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 3109,
      "step_time": 20.062915228307247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 161.8125,
      "completions/mean_terminated_length": 161.8125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3653111681342125,
      "epoch": 0.1440481704492821,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038684229366481304,
      "kl": 0.00288474268745631,
      "learning_rate": 9.711996294580823e-07,
      "loss": 0.0001,
      "num_tokens": 85582062.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3110,
      "step_time": 17.6112901866436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 357.0,
      "completions/max_terminated_length": 357.0,
      "completions/mean_length": 259.8125,
      "completions/mean_terminated_length": 259.8125,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "entropy": 0.48962145298719406,
      "epoch": 0.14409448818897638,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08644682914018631,
      "kl": 0.006093928008340299,
      "learning_rate": 9.711903659101437e-07,
      "loss": -0.1471,
      "num_tokens": 85612283.0,
      "reward": 0.2566387951374054,
      "reward_std": 0.42777517437934875,
      "rewards/reward_func/mean": 0.2566387951374054,
      "rewards/reward_func/std": 0.42777520418167114,
      "step": 3111,
      "step_time": 30.46641080826521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 121.0,
      "completions/mean_terminated_length": 121.0,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.21173788979649544,
      "epoch": 0.14414080592867068,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032269067596644163,
      "kl": 0.0016220287652686238,
      "learning_rate": 9.711811023622046e-07,
      "loss": 0.0001,
      "num_tokens": 85631611.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3112,
      "step_time": 12.906226556748152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 176.875,
      "completions/mean_terminated_length": 176.875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.36915768682956696,
      "epoch": 0.14418712366836497,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037062610499560833,
      "kl": 0.0034915836877189577,
      "learning_rate": 9.711718388142657e-07,
      "loss": 0.0002,
      "num_tokens": 85665449.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3113,
      "step_time": 20.85910550132394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 117.9375,
      "completions/mean_terminated_length": 117.9375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2525225207209587,
      "epoch": 0.1442334414080593,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003564269980415702,
      "kl": 0.002008470590226352,
      "learning_rate": 9.711625752663268e-07,
      "loss": 0.0001,
      "num_tokens": 85686072.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3114,
      "step_time": 13.030921410769224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 177.125,
      "completions/mean_terminated_length": 177.125,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.3929122984409332,
      "epoch": 0.1442797591477536,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006715687923133373,
      "kl": 0.004389499430544674,
      "learning_rate": 9.711533117183882e-07,
      "loss": 0.0002,
      "num_tokens": 85707482.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3115,
      "step_time": 18.148340705782175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 132.9375,
      "completions/mean_terminated_length": 132.9375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3351728692650795,
      "epoch": 0.1443260768874479,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021768605802208185,
      "kl": 0.0019831115496344864,
      "learning_rate": 9.711440481704493e-07,
      "loss": 0.0001,
      "num_tokens": 85729785.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3116,
      "step_time": 14.500599570572376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 351.0,
      "completions/max_terminated_length": 351.0,
      "completions/mean_length": 308.3125,
      "completions/mean_terminated_length": 308.3125,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "entropy": 0.2655940279364586,
      "epoch": 0.1443723946271422,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026614207308739424,
      "kl": 0.0020910808816552162,
      "learning_rate": 9.711347846225104e-07,
      "loss": 0.0001,
      "num_tokens": 85765534.0,
      "reward": 0.8301368355751038,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8301368355751038,
      "rewards/reward_func/std": 0.0,
      "step": 3117,
      "step_time": 31.337754849344492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 157.4375,
      "completions/mean_terminated_length": 157.4375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.4436837360262871,
      "epoch": 0.1444187123668365,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002486135810613632,
      "kl": 0.002556470106355846,
      "learning_rate": 9.711255210745716e-07,
      "loss": 0.0001,
      "num_tokens": 85808661.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3118,
      "step_time": 21.65603133663535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 141.25,
      "completions/mean_terminated_length": 141.25,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.31348419189453125,
      "epoch": 0.1444650301065308,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027583360206335783,
      "kl": 0.0016864509670995176,
      "learning_rate": 9.711162575266327e-07,
      "loss": 0.0001,
      "num_tokens": 85829481.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3119,
      "step_time": 14.478506825864315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 170.125,
      "completions/mean_terminated_length": 170.125,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.4057074710726738,
      "epoch": 0.1445113478462251,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0048325881361961365,
      "kl": 0.003486860659904778,
      "learning_rate": 9.711069939786938e-07,
      "loss": 0.0002,
      "num_tokens": 85851707.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3120,
      "step_time": 17.688097588717937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 241.1875,
      "completions/mean_terminated_length": 241.1875,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "entropy": 0.2984723672270775,
      "epoch": 0.1445576655859194,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07417890429496765,
      "kl": 0.0043800766579806805,
      "learning_rate": 9.71097730430755e-07,
      "loss": -0.0835,
      "num_tokens": 85881918.0,
      "reward": 0.05044776573777199,
      "reward_std": 0.10316508263349533,
      "rewards/reward_func/mean": 0.05044776573777199,
      "rewards/reward_func/std": 0.10316507518291473,
      "step": 3121,
      "step_time": 28.65020202472806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 155.875,
      "completions/mean_terminated_length": 155.875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.24655034393072128,
      "epoch": 0.14460398332561372,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035377475433051586,
      "kl": 0.0030263965018093586,
      "learning_rate": 9.71088466882816e-07,
      "loss": 0.0002,
      "num_tokens": 85904892.0,
      "reward": 0.010689839720726013,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.010689839720726013,
      "rewards/reward_func/std": 0.0,
      "step": 3122,
      "step_time": 16.88920145854354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 161.75,
      "completions/mean_terminated_length": 161.75,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.4134618043899536,
      "epoch": 0.14465030106530802,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022563552483916283,
      "kl": 0.0022778004640713334,
      "learning_rate": 9.710792033348772e-07,
      "loss": 0.0001,
      "num_tokens": 85942616.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3123,
      "step_time": 20.020273722708225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 246.4375,
      "completions/mean_terminated_length": 246.4375,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "entropy": 0.24199670553207397,
      "epoch": 0.14469661880500231,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06729083508253098,
      "kl": 0.004408994689583778,
      "learning_rate": 9.710699397869383e-07,
      "loss": -0.0398,
      "num_tokens": 85981791.0,
      "reward": 0.18782895803451538,
      "reward_std": 0.17358309030532837,
      "rewards/reward_func/mean": 0.18782895803451538,
      "rewards/reward_func/std": 0.17358307540416718,
      "step": 3124,
      "step_time": 28.185834880918264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 141.9375,
      "completions/mean_terminated_length": 141.9375,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.2049425132572651,
      "epoch": 0.1447429365446966,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032749222591519356,
      "kl": 0.0018695277394726872,
      "learning_rate": 9.710606762389994e-07,
      "loss": 0.0001,
      "num_tokens": 86004142.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 3125,
      "step_time": 16.163268078118563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 129.0,
      "completions/max_terminated_length": 129.0,
      "completions/mean_length": 115.5,
      "completions/mean_terminated_length": 115.5,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.3004875108599663,
      "epoch": 0.14478925428439093,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002676882781088352,
      "kl": 0.0020285207428969443,
      "learning_rate": 9.710514126910606e-07,
      "loss": 0.0001,
      "num_tokens": 86025606.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3126,
      "step_time": 12.930851683020592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 150.125,
      "completions/mean_terminated_length": 150.125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.40133099257946014,
      "epoch": 0.14483557202408523,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001862586592324078,
      "kl": 0.0019097040931228548,
      "learning_rate": 9.710421491431217e-07,
      "loss": 0.0001,
      "num_tokens": 86070696.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3127,
      "step_time": 22.270724210888147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 170.25,
      "completions/mean_terminated_length": 170.25,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.25261975824832916,
      "epoch": 0.14488188976377953,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13550913333892822,
      "kl": 0.012046173447743058,
      "learning_rate": 9.71032885595183e-07,
      "loss": 0.0374,
      "num_tokens": 86098908.0,
      "reward": 0.2242307960987091,
      "reward_std": 0.06592182070016861,
      "rewards/reward_func/mean": 0.2242307960987091,
      "rewards/reward_func/std": 0.06592182070016861,
      "step": 3128,
      "step_time": 20.090839847922325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 154.5625,
      "completions/mean_terminated_length": 154.5625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.38823341578245163,
      "epoch": 0.14492820750347382,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027239855844527483,
      "kl": 0.002623922482598573,
      "learning_rate": 9.710236220472441e-07,
      "loss": 0.0001,
      "num_tokens": 86121221.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3129,
      "step_time": 16.915862929075956
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 121.125,
      "completions/mean_terminated_length": 121.125,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.23917216807603836,
      "epoch": 0.14497452524316815,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034652315080165863,
      "kl": 0.0022992915473878384,
      "learning_rate": 9.710143584993053e-07,
      "loss": 0.0001,
      "num_tokens": 86140695.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3130,
      "step_time": 14.590781509876251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 160.625,
      "completions/mean_terminated_length": 160.625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.27476590126752853,
      "epoch": 0.14502084298286244,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033477682154625654,
      "kl": 0.002532149839680642,
      "learning_rate": 9.710050949513664e-07,
      "loss": 0.0001,
      "num_tokens": 86162145.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 3131,
      "step_time": 16.802660521119833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 126.0,
      "completions/mean_terminated_length": 126.0,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2965754419565201,
      "epoch": 0.14506716072255674,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004325771238654852,
      "kl": 0.0022911931155249476,
      "learning_rate": 9.709958314034275e-07,
      "loss": 0.0001,
      "num_tokens": 86183601.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3132,
      "step_time": 13.96632957085967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 132.3125,
      "completions/mean_terminated_length": 132.3125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.31247203797101974,
      "epoch": 0.14511347846225103,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023194546811282635,
      "kl": 0.001823212078306824,
      "learning_rate": 9.709865678554886e-07,
      "loss": 0.0001,
      "num_tokens": 86216150.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3133,
      "step_time": 17.256319250911474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 125.5625,
      "completions/mean_terminated_length": 125.5625,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.24830194935202599,
      "epoch": 0.14515979620194536,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004885249305516481,
      "kl": 0.00285379181150347,
      "learning_rate": 9.709773043075498e-07,
      "loss": 0.0001,
      "num_tokens": 86235727.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3134,
      "step_time": 13.89758824929595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 185.8125,
      "completions/mean_terminated_length": 185.8125,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3852306455373764,
      "epoch": 0.14520611394163965,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18450158834457397,
      "kl": 0.01743031432852149,
      "learning_rate": 9.709680407596109e-07,
      "loss": -0.0536,
      "num_tokens": 86257100.0,
      "reward": 0.5301079750061035,
      "reward_std": 0.4834319055080414,
      "rewards/reward_func/mean": 0.5301079750061035,
      "rewards/reward_func/std": 0.48343193531036377,
      "step": 3135,
      "step_time": 21.062769904732704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 152.4375,
      "completions/mean_terminated_length": 152.4375,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.18371649459004402,
      "epoch": 0.14525243168133395,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0074413837864995,
      "kl": 0.003688816213980317,
      "learning_rate": 9.70958777211672e-07,
      "loss": 0.0002,
      "num_tokens": 86292867.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 3136,
      "step_time": 19.577458258718252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 122.8125,
      "completions/mean_terminated_length": 122.8125,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3455648347735405,
      "epoch": 0.14529874942102824,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024457850959151983,
      "kl": 0.0016058568144217134,
      "learning_rate": 9.709495136637331e-07,
      "loss": 0.0001,
      "num_tokens": 86328912.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3137,
      "step_time": 18.181072514504194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 172.5625,
      "completions/mean_terminated_length": 172.5625,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.37562134116888046,
      "epoch": 0.14534506716072257,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036446305457502604,
      "kl": 0.0032337330048903823,
      "learning_rate": 9.709402501157943e-07,
      "loss": 0.0002,
      "num_tokens": 86362105.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3138,
      "step_time": 21.73952941223979
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 131.125,
      "completions/mean_terminated_length": 131.125,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.3309723660349846,
      "epoch": 0.14539138490041686,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027175559662282467,
      "kl": 0.002465819241479039,
      "learning_rate": 9.709309865678554e-07,
      "loss": 0.0001,
      "num_tokens": 86381995.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3139,
      "step_time": 13.53687021881342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 152.9375,
      "completions/mean_terminated_length": 152.9375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.18853257596492767,
      "epoch": 0.14543770264011116,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035405848175287247,
      "kl": 0.0023557813256047666,
      "learning_rate": 9.709217230199165e-07,
      "loss": 0.0001,
      "num_tokens": 86413770.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3140,
      "step_time": 18.62023865059018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 139.625,
      "completions/mean_terminated_length": 139.625,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.31725310534238815,
      "epoch": 0.14548402037980546,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017198971472680569,
      "kl": 0.001457408448914066,
      "learning_rate": 9.709124594719779e-07,
      "loss": 0.0001,
      "num_tokens": 86437156.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3141,
      "step_time": 15.070104915648699
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 134.3125,
      "completions/mean_terminated_length": 134.3125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.3369581550359726,
      "epoch": 0.14553033811949978,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037650528829544783,
      "kl": 0.0024292929738294333,
      "learning_rate": 9.70903195924039e-07,
      "loss": 0.0001,
      "num_tokens": 86460057.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3142,
      "step_time": 15.357313193380833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 188.9375,
      "completions/mean_terminated_length": 188.9375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.38247305154800415,
      "epoch": 0.14557665585919408,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012070218799635768,
      "kl": 0.001781894447049126,
      "learning_rate": 9.708939323760999e-07,
      "loss": 0.0001,
      "num_tokens": 86519736.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3143,
      "step_time": 28.206488724797964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 210.6875,
      "completions/mean_terminated_length": 210.6875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.329260416328907,
      "epoch": 0.14562297359888837,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0919196680188179,
      "kl": 0.003121873422060162,
      "learning_rate": 9.70884668828161e-07,
      "loss": 0.0248,
      "num_tokens": 86552803.0,
      "reward": 0.012597200460731983,
      "reward_std": 0.0003570404660422355,
      "rewards/reward_func/mean": 0.012597200460731983,
      "rewards/reward_func/std": 0.0003570404078345746,
      "step": 3144,
      "step_time": 26.431716088205576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 180.375,
      "completions/mean_terminated_length": 180.375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.17420194670557976,
      "epoch": 0.14566929133858267,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11577330529689789,
      "kl": 0.00298696500249207,
      "learning_rate": 9.708754052802224e-07,
      "loss": -0.0038,
      "num_tokens": 86594889.0,
      "reward": 0.9507848024368286,
      "reward_std": 0.03160060942173004,
      "rewards/reward_func/mean": 0.9507848024368286,
      "rewards/reward_func/std": 0.03160062059760094,
      "step": 3145,
      "step_time": 22.716472662985325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 126.0,
      "completions/mean_terminated_length": 126.0,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.2812327370047569,
      "epoch": 0.145715609078277,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022909201215952635,
      "kl": 0.0015351980400737375,
      "learning_rate": 9.708661417322835e-07,
      "loss": 0.0001,
      "num_tokens": 86614233.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3146,
      "step_time": 16.275097895413637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 116.0,
      "completions/mean_terminated_length": 116.0,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.27915628254413605,
      "epoch": 0.1457619268179713,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003274570917710662,
      "kl": 0.0017979002150241286,
      "learning_rate": 9.708568781843446e-07,
      "loss": 0.0001,
      "num_tokens": 86633657.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3147,
      "step_time": 13.511092584580183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 118.75,
      "completions/mean_terminated_length": 118.75,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2631353735923767,
      "epoch": 0.14580824455766558,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030122355092316866,
      "kl": 0.002176519948989153,
      "learning_rate": 9.708476146364057e-07,
      "loss": 0.0001,
      "num_tokens": 86653973.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3148,
      "step_time": 13.861914370208979
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 133.3125,
      "completions/mean_terminated_length": 133.3125,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.21980035305023193,
      "epoch": 0.14585456229735988,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002678386867046356,
      "kl": 0.0018450567149557173,
      "learning_rate": 9.708383510884669e-07,
      "loss": 0.0001,
      "num_tokens": 86673610.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3149,
      "step_time": 13.962938833981752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 133.0,
      "completions/mean_terminated_length": 133.0,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.27759916335344315,
      "epoch": 0.1459008800370542,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003311938839033246,
      "kl": 0.002035931043792516,
      "learning_rate": 9.70829087540528e-07,
      "loss": 0.0001,
      "num_tokens": 86701610.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3150,
      "step_time": 16.65987069159746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 151.5625,
      "completions/mean_terminated_length": 151.5625,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.20273417234420776,
      "epoch": 0.1459471977767485,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019732709042727947,
      "kl": 0.0019016769365407526,
      "learning_rate": 9.70819823992589e-07,
      "loss": 0.0001,
      "num_tokens": 86722371.0,
      "reward": 0.22313016653060913,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.22313016653060913,
      "rewards/reward_func/std": 0.0,
      "step": 3151,
      "step_time": 16.397979117929935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 174.9375,
      "completions/mean_terminated_length": 174.9375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.24122145399451256,
      "epoch": 0.1459935155164428,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008388607762753963,
      "kl": 0.00425613671541214,
      "learning_rate": 9.708105604446502e-07,
      "loss": 0.0002,
      "num_tokens": 86746674.0,
      "reward": 0.2822723090648651,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.2822723090648651,
      "rewards/reward_func/std": 0.0,
      "step": 3152,
      "step_time": 18.112127546221018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 136.4375,
      "completions/mean_terminated_length": 136.4375,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2859875038266182,
      "epoch": 0.1460398332561371,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026483035180717707,
      "kl": 0.0019898385216947645,
      "learning_rate": 9.708012968967114e-07,
      "loss": 0.0001,
      "num_tokens": 86770857.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3153,
      "step_time": 16.893994688987732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 199.0625,
      "completions/mean_terminated_length": 199.0625,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.1705622784793377,
      "epoch": 0.14608615099583142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.212143212556839,
      "kl": 0.004216172615997493,
      "learning_rate": 9.707920333487725e-07,
      "loss": 0.0482,
      "num_tokens": 86792730.0,
      "reward": 0.8634105324745178,
      "reward_std": 0.09766335785388947,
      "rewards/reward_func/mean": 0.8634105324745178,
      "rewards/reward_func/std": 0.09766335785388947,
      "step": 3154,
      "step_time": 20.41399770975113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 165.0625,
      "completions/mean_terminated_length": 165.0625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.24174608290195465,
      "epoch": 0.1461324687355257,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09421799331903458,
      "kl": 0.003022257413249463,
      "learning_rate": 9.707827698008336e-07,
      "loss": 0.017,
      "num_tokens": 86817227.0,
      "reward": 0.4943650960922241,
      "reward_std": 0.1352176070213318,
      "rewards/reward_func/mean": 0.4943650960922241,
      "rewards/reward_func/std": 0.1352176070213318,
      "step": 3155,
      "step_time": 18.282536655664444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 144.875,
      "completions/mean_terminated_length": 144.875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.32472915202379227,
      "epoch": 0.14617878647522,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005474370904266834,
      "kl": 0.002662398968823254,
      "learning_rate": 9.707735062528947e-07,
      "loss": 0.0001,
      "num_tokens": 86838217.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3156,
      "step_time": 15.807528786361217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 222.0,
      "completions/mean_terminated_length": 222.0,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.4381396025419235,
      "epoch": 0.1462251042149143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08899547159671783,
      "kl": 0.004325619898736477,
      "learning_rate": 9.707642427049559e-07,
      "loss": -0.0287,
      "num_tokens": 86872345.0,
      "reward": 0.25,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 3157,
      "step_time": 29.026765812188387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 169.6875,
      "completions/mean_terminated_length": 169.6875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.4100569635629654,
      "epoch": 0.14627142195460863,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005031133070588112,
      "kl": 0.0038968485314399004,
      "learning_rate": 9.707549791570172e-07,
      "loss": 0.0002,
      "num_tokens": 86898868.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3158,
      "step_time": 18.37262473627925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 164.9375,
      "completions/mean_terminated_length": 164.9375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.22437533736228943,
      "epoch": 0.14631773969430292,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13961029052734375,
      "kl": 0.007461741857696325,
      "learning_rate": 9.707457156090783e-07,
      "loss": -0.0655,
      "num_tokens": 86921171.0,
      "reward": 0.6732373237609863,
      "reward_std": 0.3405424654483795,
      "rewards/reward_func/mean": 0.6732373237609863,
      "rewards/reward_func/std": 0.3405424654483795,
      "step": 3159,
      "step_time": 18.213964194059372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 154.375,
      "completions/mean_terminated_length": 154.375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3993169814348221,
      "epoch": 0.14636405743399722,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010213624686002731,
      "kl": 0.003526468004565686,
      "learning_rate": 9.707364520611394e-07,
      "loss": 0.0002,
      "num_tokens": 86951097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3160,
      "step_time": 17.57730484753847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 193.0,
      "completions/mean_terminated_length": 193.0,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.22103362530469894,
      "epoch": 0.14641037517369151,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11382729560136795,
      "kl": 0.0024669055710546672,
      "learning_rate": 9.707271885132006e-07,
      "loss": -0.0636,
      "num_tokens": 86975513.0,
      "reward": 0.5914242267608643,
      "reward_std": 0.47124677896499634,
      "rewards/reward_func/mean": 0.5914242267608643,
      "rewards/reward_func/std": 0.4712468087673187,
      "step": 3161,
      "step_time": 19.94507908821106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 121.8125,
      "completions/mean_terminated_length": 121.8125,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.35823170840740204,
      "epoch": 0.14645669291338584,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034570039715617895,
      "kl": 0.002837582549545914,
      "learning_rate": 9.707179249652617e-07,
      "loss": 0.0001,
      "num_tokens": 87003062.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3162,
      "step_time": 15.633826054632664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 138.9375,
      "completions/mean_terminated_length": 138.9375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3633083179593086,
      "epoch": 0.14650301065308013,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005952723324298859,
      "kl": 0.0032613181974738836,
      "learning_rate": 9.707086614173228e-07,
      "loss": 0.0002,
      "num_tokens": 87023733.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3163,
      "step_time": 16.495303072035313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 136.625,
      "completions/mean_terminated_length": 136.625,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.26769497245550156,
      "epoch": 0.14654932839277443,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015609278343617916,
      "kl": 0.0012672932643909007,
      "learning_rate": 9.70699397869384e-07,
      "loss": 0.0001,
      "num_tokens": 87045103.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3164,
      "step_time": 14.988057252019644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 269.375,
      "completions/mean_terminated_length": 219.06668090820312,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.4312158152461052,
      "epoch": 0.14659564613246873,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07963003218173981,
      "kl": 0.008922230685129762,
      "learning_rate": 9.70690134321445e-07,
      "loss": -0.0593,
      "num_tokens": 87082725.0,
      "reward": 0.029000209644436836,
      "reward_std": 0.1158318817615509,
      "rewards/reward_func/mean": 0.029000209644436836,
      "rewards/reward_func/std": 0.1158318817615509,
      "step": 3165,
      "step_time": 82.63550824671984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 147.4375,
      "completions/mean_terminated_length": 147.4375,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3034921735525131,
      "epoch": 0.14664196387216305,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006009840406477451,
      "kl": 0.0034268525196239352,
      "learning_rate": 9.706808707735062e-07,
      "loss": 0.0002,
      "num_tokens": 87103004.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3166,
      "step_time": 16.10325925424695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 211.75,
      "completions/mean_terminated_length": 211.75,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.263863917440176,
      "epoch": 0.14668828161185735,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12380976229906082,
      "kl": 0.00807945930864662,
      "learning_rate": 9.706716072255673e-07,
      "loss": -0.0214,
      "num_tokens": 87132760.0,
      "reward": 0.8609727621078491,
      "reward_std": 0.24149809777736664,
      "rewards/reward_func/mean": 0.8609727621078491,
      "rewards/reward_func/std": 0.24149809777736664,
      "step": 3167,
      "step_time": 22.885518915951252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 147.5,
      "completions/mean_terminated_length": 147.5,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.3837863504886627,
      "epoch": 0.14673459935155164,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017830505967140198,
      "kl": 0.002071816212264821,
      "learning_rate": 9.706623436776284e-07,
      "loss": 0.0001,
      "num_tokens": 87173712.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3168,
      "step_time": 21.10164326801896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 186.5,
      "completions/mean_terminated_length": 186.5,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.1757211908698082,
      "epoch": 0.14678091709124594,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004815271124243736,
      "kl": 0.0019804288749583066,
      "learning_rate": 9.706530801296896e-07,
      "loss": 0.0001,
      "num_tokens": 87197368.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3169,
      "step_time": 18.87556202709675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 215.5,
      "completions/mean_terminated_length": 215.5,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.24164295196533203,
      "epoch": 0.14682723483094026,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1353035271167755,
      "kl": 0.00897871411871165,
      "learning_rate": 9.706438165817507e-07,
      "loss": -0.0041,
      "num_tokens": 87229216.0,
      "reward": 0.7134220600128174,
      "reward_std": 0.4254027307033539,
      "rewards/reward_func/mean": 0.7134220600128174,
      "rewards/reward_func/std": 0.4254027307033539,
      "step": 3170,
      "step_time": 23.80917016416788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 216.4375,
      "completions/mean_terminated_length": 216.4375,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.2717517167329788,
      "epoch": 0.14687355257063456,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19454370439052582,
      "kl": 0.010912682628259063,
      "learning_rate": 9.70634553033812e-07,
      "loss": 0.0039,
      "num_tokens": 87260199.0,
      "reward": 0.8419197797775269,
      "reward_std": 0.3306751251220703,
      "rewards/reward_func/mean": 0.8419197797775269,
      "rewards/reward_func/std": 0.3306751251220703,
      "step": 3171,
      "step_time": 22.230905380100012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 200.0,
      "completions/mean_terminated_length": 200.0,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.41661446541547775,
      "epoch": 0.14691987031032885,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027504321187734604,
      "kl": 0.002257747430121526,
      "learning_rate": 9.706252894858731e-07,
      "loss": 0.0001,
      "num_tokens": 87317207.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3172,
      "step_time": 30.25461930781603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 183.3125,
      "completions/mean_terminated_length": 183.3125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.22347508743405342,
      "epoch": 0.14696618805002315,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032916993368417025,
      "kl": 0.002332076954189688,
      "learning_rate": 9.706160259379343e-07,
      "loss": 0.0001,
      "num_tokens": 87344204.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 3173,
      "step_time": 19.114656172692776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 324.0,
      "completions/max_terminated_length": 324.0,
      "completions/mean_length": 276.0625,
      "completions/mean_terminated_length": 276.0625,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.25793934240937233,
      "epoch": 0.14701250578971747,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08309826999902725,
      "kl": 0.00859370466787368,
      "learning_rate": 9.706067623899952e-07,
      "loss": -0.1086,
      "num_tokens": 87377437.0,
      "reward": 0.7614637613296509,
      "reward_std": 0.38147062063217163,
      "rewards/reward_func/mean": 0.7614637613296509,
      "rewards/reward_func/std": 0.381470650434494,
      "step": 3174,
      "step_time": 28.57931701466441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 219.25,
      "completions/mean_terminated_length": 219.25,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "entropy": 0.20336568355560303,
      "epoch": 0.14705882352941177,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005387068260461092,
      "kl": 0.0037300767726264894,
      "learning_rate": 9.705974988420565e-07,
      "loss": 0.0002,
      "num_tokens": 87415377.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3175,
      "step_time": 23.53774171322584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 149.1875,
      "completions/mean_terminated_length": 149.1875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.24261876940727234,
      "epoch": 0.14710514126910607,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006944534834474325,
      "kl": 0.002476296591339633,
      "learning_rate": 9.705882352941176e-07,
      "loss": 0.0001,
      "num_tokens": 87435172.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3176,
      "step_time": 15.371956024318933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 135.9375,
      "completions/mean_terminated_length": 135.9375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.24093306064605713,
      "epoch": 0.14715145900880036,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009514703415334225,
      "kl": 0.004567248863168061,
      "learning_rate": 9.705789717461788e-07,
      "loss": 0.0002,
      "num_tokens": 87455315.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3177,
      "step_time": 13.69770859926939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 175.3125,
      "completions/mean_terminated_length": 175.3125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.24081763625144958,
      "epoch": 0.14719777674849469,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016568658174946904,
      "kl": 0.0015491091180592775,
      "learning_rate": 9.7056970819824e-07,
      "loss": 0.0001,
      "num_tokens": 87478216.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3178,
      "step_time": 19.73670904710889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 122.9375,
      "completions/mean_terminated_length": 122.9375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2850130945444107,
      "epoch": 0.14724409448818898,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033600255846977234,
      "kl": 0.0024400824040640146,
      "learning_rate": 9.70560444650301e-07,
      "loss": 0.0001,
      "num_tokens": 87499943.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3179,
      "step_time": 13.772666417062283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 137.1875,
      "completions/mean_terminated_length": 137.1875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.2709697335958481,
      "epoch": 0.14729041222788328,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004015101585537195,
      "kl": 0.002116362185915932,
      "learning_rate": 9.705511811023621e-07,
      "loss": 0.0001,
      "num_tokens": 87523626.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3180,
      "step_time": 15.20681268721819
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 138.75,
      "completions/mean_terminated_length": 138.75,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.2954035848379135,
      "epoch": 0.14733672996757757,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003261976409703493,
      "kl": 0.0021558912412729114,
      "learning_rate": 9.705419175544233e-07,
      "loss": 0.0001,
      "num_tokens": 87550550.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3181,
      "step_time": 15.935935780405998
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 183.6875,
      "completions/mean_terminated_length": 183.6875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.37066788971424103,
      "epoch": 0.1473830477072719,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011409871280193329,
      "kl": 0.005945907789282501,
      "learning_rate": 9.705326540064844e-07,
      "loss": 0.0003,
      "num_tokens": 87573057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3182,
      "step_time": 21.450191903859377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 194.8125,
      "completions/mean_terminated_length": 194.8125,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.31197257339954376,
      "epoch": 0.1474293654469662,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16130420565605164,
      "kl": 0.0033873734064400196,
      "learning_rate": 9.705233904585455e-07,
      "loss": -0.1348,
      "num_tokens": 87607534.0,
      "reward": 0.4347401559352875,
      "reward_std": 0.41662493348121643,
      "rewards/reward_func/mean": 0.4347401559352875,
      "rewards/reward_func/std": 0.41662493348121643,
      "step": 3183,
      "step_time": 24.65077030658722
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 170.875,
      "completions/mean_terminated_length": 170.875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.17241237312555313,
      "epoch": 0.1474756831866605,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002606244059279561,
      "kl": 0.001355106389382854,
      "learning_rate": 9.705141269106066e-07,
      "loss": 0.0001,
      "num_tokens": 87656268.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 3184,
      "step_time": 24.78933433443308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 441.0,
      "completions/max_terminated_length": 441.0,
      "completions/mean_length": 227.375,
      "completions/mean_terminated_length": 227.375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.3972332626581192,
      "epoch": 0.14752200092635478,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08933868259191513,
      "kl": 0.006041689426638186,
      "learning_rate": 9.70504863362668e-07,
      "loss": -0.3568,
      "num_tokens": 87683954.0,
      "reward": 0.1817312240600586,
      "reward_std": 0.39071038365364075,
      "rewards/reward_func/mean": 0.1817312240600586,
      "rewards/reward_func/std": 0.39071041345596313,
      "step": 3185,
      "step_time": 36.7055429071188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 180.875,
      "completions/mean_terminated_length": 180.875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.378615565598011,
      "epoch": 0.1475683186660491,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1564921885728836,
      "kl": 0.005293174646794796,
      "learning_rate": 9.70495599814729e-07,
      "loss": 0.0716,
      "num_tokens": 87712528.0,
      "reward": 0.3393140435218811,
      "reward_std": 0.4524186849594116,
      "rewards/reward_func/mean": 0.3393140435218811,
      "rewards/reward_func/std": 0.4524187445640564,
      "step": 3186,
      "step_time": 21.542234182357788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 184.75,
      "completions/mean_terminated_length": 184.75,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.37833455204963684,
      "epoch": 0.1476146364057434,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026527580339461565,
      "kl": 0.00210870907176286,
      "learning_rate": 9.7048633626679e-07,
      "loss": 0.0001,
      "num_tokens": 87760604.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3187,
      "step_time": 25.973256528377533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 203.125,
      "completions/mean_terminated_length": 203.125,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.3513263314962387,
      "epoch": 0.1476609541454377,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036778149660676718,
      "kl": 0.0033898663241416216,
      "learning_rate": 9.704770727188514e-07,
      "loss": 0.0002,
      "num_tokens": 87794494.0,
      "reward": 0.169904425740242,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.169904425740242,
      "rewards/reward_func/std": 0.0,
      "step": 3188,
      "step_time": 22.684977620840073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 203.9375,
      "completions/mean_terminated_length": 203.9375,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.4454081431031227,
      "epoch": 0.147707271885132,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1215273067355156,
      "kl": 0.008781867567449808,
      "learning_rate": 9.704678091709125e-07,
      "loss": 0.0495,
      "num_tokens": 87818925.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 3189,
      "step_time": 21.12729962915182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 199.125,
      "completions/mean_terminated_length": 199.125,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.23385056480765343,
      "epoch": 0.14775358962482632,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016233468195423484,
      "kl": 0.0012905322655569762,
      "learning_rate": 9.704585456229736e-07,
      "loss": 0.0001,
      "num_tokens": 87853327.0,
      "reward": 0.6147881746292114,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6147881746292114,
      "rewards/reward_func/std": 0.0,
      "step": 3190,
      "step_time": 22.21422252431512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 213.5,
      "completions/mean_terminated_length": 213.5,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.24486621469259262,
      "epoch": 0.14779990736452062,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017838386120274663,
      "kl": 0.0016516250325366855,
      "learning_rate": 9.704492820750347e-07,
      "loss": 0.0001,
      "num_tokens": 87882647.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3191,
      "step_time": 21.689008958637714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 170.75,
      "completions/mean_terminated_length": 170.75,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.39651864767074585,
      "epoch": 0.1478462251042149,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006718486547470093,
      "kl": 0.004706620529759675,
      "learning_rate": 9.704400185270959e-07,
      "loss": 0.0002,
      "num_tokens": 87920563.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3192,
      "step_time": 22.389150597155094
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 172.5,
      "completions/mean_terminated_length": 172.5,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.210244283080101,
      "epoch": 0.1478925428439092,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016268540639430285,
      "kl": 0.001841002085711807,
      "learning_rate": 9.70430754979157e-07,
      "loss": 0.0001,
      "num_tokens": 87942907.0,
      "reward": 0.5471704602241516,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5471704602241516,
      "rewards/reward_func/std": 0.0,
      "step": 3193,
      "step_time": 17.487916626036167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 242.8125,
      "completions/mean_terminated_length": 242.8125,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "entropy": 0.17763400077819824,
      "epoch": 0.14793886058360353,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006192723289132118,
      "kl": 0.003738157683983445,
      "learning_rate": 9.704214914312181e-07,
      "loss": 0.0002,
      "num_tokens": 87967928.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3194,
      "step_time": 24.626737490296364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 209.125,
      "completions/mean_terminated_length": 209.125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.27085962146520615,
      "epoch": 0.14798517832329783,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08795936405658722,
      "kl": 0.0032920980593189597,
      "learning_rate": 9.704122278832792e-07,
      "loss": 0.0384,
      "num_tokens": 88005498.0,
      "reward": 0.9402012825012207,
      "reward_std": 0.16340123116970062,
      "rewards/reward_func/mean": 0.9402012825012207,
      "rewards/reward_func/std": 0.16340124607086182,
      "step": 3195,
      "step_time": 24.832705087959766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 147.9375,
      "completions/mean_terminated_length": 147.9375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.34720899164676666,
      "epoch": 0.14803149606299212,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00362460152246058,
      "kl": 0.0026282008038833737,
      "learning_rate": 9.704029643353404e-07,
      "loss": 0.0001,
      "num_tokens": 88036553.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3196,
      "step_time": 17.945649698376656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 120.875,
      "completions/mean_terminated_length": 120.875,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.3234320357441902,
      "epoch": 0.14807781380268642,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002180765150114894,
      "kl": 0.0016997777274809778,
      "learning_rate": 9.703937007874015e-07,
      "loss": 0.0001,
      "num_tokens": 88057847.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3197,
      "step_time": 13.488488294184208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 142.1875,
      "completions/mean_terminated_length": 142.1875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.403076708316803,
      "epoch": 0.14812413154238074,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007377514149993658,
      "kl": 0.003922757343389094,
      "learning_rate": 9.703844372394626e-07,
      "loss": 0.0002,
      "num_tokens": 88101066.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3198,
      "step_time": 20.92857074737549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 416.0,
      "completions/max_terminated_length": 416.0,
      "completions/mean_length": 350.5,
      "completions/mean_terminated_length": 350.5,
      "completions/min_length": 289.0,
      "completions/min_terminated_length": 289.0,
      "entropy": 0.3393295705318451,
      "epoch": 0.14817044928207504,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003656057408079505,
      "kl": 0.0035902425879612565,
      "learning_rate": 9.703751736915237e-07,
      "loss": 0.0002,
      "num_tokens": 88142146.0,
      "reward": 0.7663013935089111,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7663013935089111,
      "rewards/reward_func/std": 0.0,
      "step": 3199,
      "step_time": 37.4694101922214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 123.0625,
      "completions/mean_terminated_length": 123.0625,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.30981073528528214,
      "epoch": 0.14821676702176934,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005902593489736319,
      "kl": 0.0032484132098034024,
      "learning_rate": 9.703659101435849e-07,
      "loss": 0.0002,
      "num_tokens": 88161651.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3200,
      "step_time": 13.90316016599536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 170.8125,
      "completions/mean_terminated_length": 170.8125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3382454290986061,
      "epoch": 0.14826308476146363,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009572918526828289,
      "kl": 0.005233524250797927,
      "learning_rate": 9.703566465956462e-07,
      "loss": 0.0003,
      "num_tokens": 88182048.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3201,
      "step_time": 17.268680974841118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 180.8125,
      "completions/mean_terminated_length": 180.8125,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.22474030777812004,
      "epoch": 0.14830940250115796,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004835336469113827,
      "kl": 0.0031781523721292615,
      "learning_rate": 9.703473830477073e-07,
      "loss": 0.0002,
      "num_tokens": 88206765.0,
      "reward": 0.9534969329833984,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9534969329833984,
      "rewards/reward_func/std": 0.0,
      "step": 3202,
      "step_time": 19.9906326495111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 202.25,
      "completions/mean_terminated_length": 202.25,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.36052901297807693,
      "epoch": 0.14835572024085225,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09114061295986176,
      "kl": 0.007722419453784823,
      "learning_rate": 9.703381194997684e-07,
      "loss": -0.0326,
      "num_tokens": 88229025.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 3203,
      "step_time": 20.64932259172201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 191.75,
      "completions/mean_terminated_length": 191.75,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.1975666582584381,
      "epoch": 0.14840203798054655,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023789252154529095,
      "kl": 0.0017853862955234945,
      "learning_rate": 9.703288559518296e-07,
      "loss": 0.0001,
      "num_tokens": 88252541.0,
      "reward": 0.32680743932724,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.32680743932724,
      "rewards/reward_func/std": 0.0,
      "step": 3204,
      "step_time": 19.48708562925458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 173.5,
      "completions/mean_terminated_length": 173.5,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.3965426906943321,
      "epoch": 0.14844835572024084,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024332997854799032,
      "kl": 0.002204675634857267,
      "learning_rate": 9.703195924038907e-07,
      "loss": 0.0001,
      "num_tokens": 88298277.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3205,
      "step_time": 23.821564003825188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 334.0,
      "completions/max_terminated_length": 334.0,
      "completions/mean_length": 265.875,
      "completions/mean_terminated_length": 265.875,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "entropy": 0.26234976947307587,
      "epoch": 0.14849467345993517,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08036817610263824,
      "kl": 0.007552730618044734,
      "learning_rate": 9.703103288559518e-07,
      "loss": 0.0607,
      "num_tokens": 88328707.0,
      "reward": 0.8278868198394775,
      "reward_std": 0.23553621768951416,
      "rewards/reward_func/mean": 0.8278868198394775,
      "rewards/reward_func/std": 0.23553623259067535,
      "step": 3206,
      "step_time": 28.886310018599033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 119.75,
      "completions/mean_terminated_length": 119.75,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.24381082132458687,
      "epoch": 0.14854099119962946,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006969882640987635,
      "kl": 0.002445285295834765,
      "learning_rate": 9.70301065308013e-07,
      "loss": 0.0001,
      "num_tokens": 88348367.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3207,
      "step_time": 14.086081974208355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 119.0,
      "completions/mean_terminated_length": 119.0,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.29726671427488327,
      "epoch": 0.14858730893932376,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015938072465360165,
      "kl": 0.0015336872020270675,
      "learning_rate": 9.70291801760074e-07,
      "loss": 0.0001,
      "num_tokens": 88371119.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3208,
      "step_time": 13.541106462478638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 164.25,
      "completions/mean_terminated_length": 164.25,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.116773946210742,
      "epoch": 0.14863362667901805,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1421230435371399,
      "kl": 0.0242515429854393,
      "learning_rate": 9.702825382121352e-07,
      "loss": -0.0416,
      "num_tokens": 88393571.0,
      "reward": 0.9391771554946899,
      "reward_std": 0.1661996841430664,
      "rewards/reward_func/mean": 0.9391771554946899,
      "rewards/reward_func/std": 0.1661996990442276,
      "step": 3209,
      "step_time": 16.628654144704342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 200.0,
      "completions/mean_terminated_length": 200.0,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.24974997341632843,
      "epoch": 0.14867994441871238,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004038608632981777,
      "kl": 0.003384222276508808,
      "learning_rate": 9.702732746641963e-07,
      "loss": 0.0002,
      "num_tokens": 88419571.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 3210,
      "step_time": 21.770447835326195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 213.625,
      "completions/mean_terminated_length": 213.625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.44399744272232056,
      "epoch": 0.14872626215840667,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036541500594466925,
      "kl": 0.0028832972166128457,
      "learning_rate": 9.702640111162574e-07,
      "loss": 0.0001,
      "num_tokens": 88448765.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3211,
      "step_time": 23.496972754597664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 162.375,
      "completions/mean_terminated_length": 162.375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.14537383988499641,
      "epoch": 0.14877257989810097,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004367548506706953,
      "kl": 0.0027702118968591094,
      "learning_rate": 9.702547475683186e-07,
      "loss": 0.0001,
      "num_tokens": 88470947.0,
      "reward": 0.8250529766082764,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8250529766082764,
      "rewards/reward_func/std": 0.0,
      "step": 3212,
      "step_time": 16.668695371598005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 141.875,
      "completions/mean_terminated_length": 141.875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.21356702968478203,
      "epoch": 0.14881889763779527,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032490375451743603,
      "kl": 0.002257484768051654,
      "learning_rate": 9.702454840203797e-07,
      "loss": 0.0001,
      "num_tokens": 88490945.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3213,
      "step_time": 13.733813617378473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 148.6875,
      "completions/mean_terminated_length": 148.6875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3643890991806984,
      "epoch": 0.1488652153774896,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014543255092576146,
      "kl": 0.0017717880546115339,
      "learning_rate": 9.702362204724408e-07,
      "loss": 0.0001,
      "num_tokens": 88543036.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3214,
      "step_time": 23.729951851069927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 180.9375,
      "completions/mean_terminated_length": 180.9375,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.36227110773324966,
      "epoch": 0.1489115331171839,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00365792540833354,
      "kl": 0.0025077573372982442,
      "learning_rate": 9.702269569245022e-07,
      "loss": 0.0001,
      "num_tokens": 88570475.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3215,
      "step_time": 22.424935221672058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 194.375,
      "completions/mean_terminated_length": 194.375,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.37817592918872833,
      "epoch": 0.14895785085687818,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12057393044233322,
      "kl": 0.008120844489894807,
      "learning_rate": 9.702176933765633e-07,
      "loss": -0.0371,
      "num_tokens": 88592369.0,
      "reward": 0.6736177206039429,
      "reward_std": 0.4698963463306427,
      "rewards/reward_func/mean": 0.6736177206039429,
      "rewards/reward_func/std": 0.4698963761329651,
      "step": 3216,
      "step_time": 19.30144726112485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 139.1875,
      "completions/mean_terminated_length": 139.1875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.1959523782134056,
      "epoch": 0.14900416859657248,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.128775954246521,
      "kl": 0.020842507015913725,
      "learning_rate": 9.702084298286242e-07,
      "loss": -0.105,
      "num_tokens": 88618676.0,
      "reward": 0.45511555671691895,
      "reward_std": 0.3485543727874756,
      "rewards/reward_func/mean": 0.45511555671691895,
      "rewards/reward_func/std": 0.3485543727874756,
      "step": 3217,
      "step_time": 16.61471777409315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 139.875,
      "completions/mean_terminated_length": 139.875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.31921062618494034,
      "epoch": 0.1490504863362668,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025665706489235163,
      "kl": 0.0021532553946599364,
      "learning_rate": 9.701991662806855e-07,
      "loss": 0.0001,
      "num_tokens": 88640482.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3218,
      "step_time": 15.491533864289522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 204.6875,
      "completions/mean_terminated_length": 204.6875,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.3016367256641388,
      "epoch": 0.1490968040759611,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037496606819331646,
      "kl": 0.0024888652842491865,
      "learning_rate": 9.701899027327467e-07,
      "loss": 0.0001,
      "num_tokens": 88666141.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3219,
      "step_time": 20.989007283002138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 194.0,
      "completions/mean_terminated_length": 194.0,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.18908924981951714,
      "epoch": 0.1491431218156554,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033928302582353354,
      "kl": 0.0019500574562698603,
      "learning_rate": 9.701806391848078e-07,
      "loss": 0.0001,
      "num_tokens": 88697245.0,
      "reward": 0.9534969329833984,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9534969329833984,
      "rewards/reward_func/std": 0.0,
      "step": 3220,
      "step_time": 21.1616225913167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 148.3125,
      "completions/mean_terminated_length": 148.3125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.2388499341905117,
      "epoch": 0.1491894395553497,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10226103663444519,
      "kl": 0.0020513379713520408,
      "learning_rate": 9.70171375636869e-07,
      "loss": 0.008,
      "num_tokens": 88718754.0,
      "reward": 0.9940523505210876,
      "reward_std": 0.023790646344423294,
      "rewards/reward_func/mean": 0.9940523505210876,
      "rewards/reward_func/std": 0.023790642619132996,
      "step": 3221,
      "step_time": 15.831282813102007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 116.875,
      "completions/mean_terminated_length": 116.875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2894498035311699,
      "epoch": 0.149235757295044,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027666990645229816,
      "kl": 0.0019624547276180238,
      "learning_rate": 9.7016211208893e-07,
      "loss": 0.0001,
      "num_tokens": 88739360.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3222,
      "step_time": 13.560561783611774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 307.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 277.4375,
      "completions/mean_terminated_length": 277.4375,
      "completions/min_length": 253.0,
      "completions/min_terminated_length": 253.0,
      "entropy": 0.2797243595123291,
      "epoch": 0.1492820750347383,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012131128460168839,
      "kl": 0.007326827384531498,
      "learning_rate": 9.701528485409912e-07,
      "loss": 0.0004,
      "num_tokens": 88764935.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3223,
      "step_time": 26.064146503806114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 161.4375,
      "completions/mean_terminated_length": 161.4375,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.33683259040117264,
      "epoch": 0.1493283927744326,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005655890330672264,
      "kl": 0.0031368256313726306,
      "learning_rate": 9.701435849930523e-07,
      "loss": 0.0002,
      "num_tokens": 88800158.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3224,
      "step_time": 20.38260020688176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 155.5,
      "completions/mean_terminated_length": 155.5,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.1556338369846344,
      "epoch": 0.1493747105141269,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18061292171478271,
      "kl": 0.0028197705396451056,
      "learning_rate": 9.701343214451134e-07,
      "loss": 0.0009,
      "num_tokens": 88822134.0,
      "reward": 0.9353712797164917,
      "reward_std": 0.01723429374396801,
      "rewards/reward_func/mean": 0.9353712797164917,
      "rewards/reward_func/std": 0.017234310507774353,
      "step": 3225,
      "step_time": 16.026508655399084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 121.0,
      "completions/max_terminated_length": 121.0,
      "completions/mean_length": 110.0,
      "completions/mean_terminated_length": 110.0,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2682938277721405,
      "epoch": 0.14942102825382123,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004825829993933439,
      "kl": 0.0025337798288092017,
      "learning_rate": 9.701250578971745e-07,
      "loss": 0.0001,
      "num_tokens": 88842390.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3226,
      "step_time": 12.033143199980259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 208.8125,
      "completions/mean_terminated_length": 208.8125,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.4151565060019493,
      "epoch": 0.14946734599351552,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11671051383018494,
      "kl": 0.009419246343895793,
      "learning_rate": 9.701157943492357e-07,
      "loss": -0.0443,
      "num_tokens": 88872707.0,
      "reward": 0.1004081666469574,
      "reward_std": 0.2836160361766815,
      "rewards/reward_func/mean": 0.1004081666469574,
      "rewards/reward_func/std": 0.2836160361766815,
      "step": 3227,
      "step_time": 23.99384493380785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 161.25,
      "completions/mean_terminated_length": 161.25,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.38457028567790985,
      "epoch": 0.14951366373320982,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025691171176731586,
      "kl": 0.0020561676064971834,
      "learning_rate": 9.70106530801297e-07,
      "loss": 0.0001,
      "num_tokens": 88911767.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3228,
      "step_time": 20.47853649035096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 161.25,
      "completions/mean_terminated_length": 161.25,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.1561616212129593,
      "epoch": 0.1495599814729041,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015544591005891562,
      "kl": 0.0013966645346954465,
      "learning_rate": 9.70097267253358e-07,
      "loss": 0.0001,
      "num_tokens": 88935419.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 3229,
      "step_time": 17.219679478555918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 124.875,
      "completions/mean_terminated_length": 124.875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.29462575912475586,
      "epoch": 0.14960629921259844,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028986926190555096,
      "kl": 0.0019535927567631006,
      "learning_rate": 9.70088003705419e-07,
      "loss": 0.0001,
      "num_tokens": 88957065.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3230,
      "step_time": 13.661428939551115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 160.8125,
      "completions/mean_terminated_length": 160.8125,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.1987394392490387,
      "epoch": 0.14965261695229273,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12018464505672455,
      "kl": 0.020558612421154976,
      "learning_rate": 9.700787401574804e-07,
      "loss": -0.0604,
      "num_tokens": 88994118.0,
      "reward": 0.7430945634841919,
      "reward_std": 0.20552437007427216,
      "rewards/reward_func/mean": 0.7430945634841919,
      "rewards/reward_func/std": 0.20552437007427216,
      "step": 3231,
      "step_time": 21.399724923074245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 123.5,
      "completions/mean_terminated_length": 123.5,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.24140437692403793,
      "epoch": 0.14969893469198703,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00472430232912302,
      "kl": 0.002421194250928238,
      "learning_rate": 9.700694766095415e-07,
      "loss": 0.0001,
      "num_tokens": 89013662.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3232,
      "step_time": 13.645048223435879
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 177.1875,
      "completions/mean_terminated_length": 177.1875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.3778412416577339,
      "epoch": 0.14974525243168132,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00442263949662447,
      "kl": 0.003878927731420845,
      "learning_rate": 9.700602130616026e-07,
      "loss": 0.0002,
      "num_tokens": 89042033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3233,
      "step_time": 21.010863408446312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 144.375,
      "completions/mean_terminated_length": 144.375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.30326658487319946,
      "epoch": 0.14979157017137565,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00421948591247201,
      "kl": 0.0025798884744290262,
      "learning_rate": 9.700509495136637e-07,
      "loss": 0.0001,
      "num_tokens": 89064583.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3234,
      "step_time": 16.326236341148615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 135.625,
      "completions/mean_terminated_length": 135.625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2790285795927048,
      "epoch": 0.14983788791106994,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002742520533502102,
      "kl": 0.0020693527185358107,
      "learning_rate": 9.700416859657249e-07,
      "loss": 0.0001,
      "num_tokens": 89087937.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3235,
      "step_time": 15.345882892608643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 148.875,
      "completions/mean_terminated_length": 148.875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.2681538164615631,
      "epoch": 0.14988420565076424,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016844841884449124,
      "kl": 0.0013607275031972677,
      "learning_rate": 9.70032422417786e-07,
      "loss": 0.0001,
      "num_tokens": 89108703.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3236,
      "step_time": 16.236670289188623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 176.375,
      "completions/mean_terminated_length": 176.375,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.19332417845726013,
      "epoch": 0.14993052339045854,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010703495936468244,
      "kl": 0.0008461621328024194,
      "learning_rate": 9.700231588698471e-07,
      "loss": 0.0,
      "num_tokens": 89130357.0,
      "reward": 0.3011942207813263,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3011942207813263,
      "rewards/reward_func/std": 0.0,
      "step": 3237,
      "step_time": 18.122743774205446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 297.0,
      "completions/max_terminated_length": 297.0,
      "completions/mean_length": 236.1875,
      "completions/mean_terminated_length": 236.1875,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.21174217388033867,
      "epoch": 0.14997684113015286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06723204255104065,
      "kl": 0.0013857937447028235,
      "learning_rate": 9.700138953219082e-07,
      "loss": -0.0366,
      "num_tokens": 89164504.0,
      "reward": 0.9305884838104248,
      "reward_std": 0.018509721383452415,
      "rewards/reward_func/mean": 0.9305884838104248,
      "rewards/reward_func/std": 0.01850973069667816,
      "step": 3238,
      "step_time": 27.427233666181564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 345.0,
      "completions/max_terminated_length": 345.0,
      "completions/mean_length": 215.5625,
      "completions/mean_terminated_length": 215.5625,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.46943120658397675,
      "epoch": 0.15002315886984716,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004929953720420599,
      "kl": 0.0032900521182455122,
      "learning_rate": 9.700046317739694e-07,
      "loss": 0.0002,
      "num_tokens": 89191793.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3239,
      "step_time": 29.97055809572339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 133.875,
      "completions/mean_terminated_length": 133.875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.26193320006132126,
      "epoch": 0.15006947660954145,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027752660680562258,
      "kl": 0.0021281801746226847,
      "learning_rate": 9.699953682260305e-07,
      "loss": 0.0001,
      "num_tokens": 89211471.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3240,
      "step_time": 14.621359758079052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 157.1875,
      "completions/mean_terminated_length": 157.1875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.36254987865686417,
      "epoch": 0.15011579434923575,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005949254613369703,
      "kl": 0.0031668933806940913,
      "learning_rate": 9.699861046780916e-07,
      "loss": 0.0002,
      "num_tokens": 89237138.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3241,
      "step_time": 17.5031932964921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 121.25,
      "completions/mean_terminated_length": 121.25,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.29025503993034363,
      "epoch": 0.15016211208893007,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004383976571261883,
      "kl": 0.002748581231571734,
      "learning_rate": 9.699768411301527e-07,
      "loss": 0.0001,
      "num_tokens": 89256966.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3242,
      "step_time": 13.214446749538183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 209.625,
      "completions/mean_terminated_length": 209.625,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.3311644718050957,
      "epoch": 0.15020842982862437,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10076361149549484,
      "kl": 0.008241066941991448,
      "learning_rate": 9.699675775822139e-07,
      "loss": 0.0323,
      "num_tokens": 89292432.0,
      "reward": 0.7135276794433594,
      "reward_std": 0.35400888323783875,
      "rewards/reward_func/mean": 0.7135276794433594,
      "rewards/reward_func/std": 0.35400888323783875,
      "step": 3243,
      "step_time": 26.671080119907856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 147.875,
      "completions/mean_terminated_length": 147.875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.37825947999954224,
      "epoch": 0.15025474756831866,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032125997822731733,
      "kl": 0.0022422123001888394,
      "learning_rate": 9.69958314034275e-07,
      "loss": 0.0001,
      "num_tokens": 89349246.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3244,
      "step_time": 24.166430916637182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 138.8125,
      "completions/mean_terminated_length": 138.8125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.30234841257333755,
      "epoch": 0.15030106530801296,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018751475727185607,
      "kl": 0.001480356848333031,
      "learning_rate": 9.699490504863363e-07,
      "loss": 0.0001,
      "num_tokens": 89375211.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3245,
      "step_time": 16.059069741517305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 211.875,
      "completions/mean_terminated_length": 211.875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.3353811204433441,
      "epoch": 0.15034738304770728,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10169153660535812,
      "kl": 0.00472992321010679,
      "learning_rate": 9.699397869383974e-07,
      "loss": -0.0405,
      "num_tokens": 89405737.0,
      "reward": 0.3852382004261017,
      "reward_std": 0.4519440531730652,
      "rewards/reward_func/mean": 0.3852382004261017,
      "rewards/reward_func/std": 0.4519440829753876,
      "step": 3246,
      "step_time": 22.704330950975418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 129.625,
      "completions/mean_terminated_length": 129.625,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.3572033792734146,
      "epoch": 0.15039370078740158,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028394977562129498,
      "kl": 0.0023871789453551173,
      "learning_rate": 9.699305233904586e-07,
      "loss": 0.0001,
      "num_tokens": 89440691.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3247,
      "step_time": 17.94020389392972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 152.125,
      "completions/mean_terminated_length": 152.125,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.2701178193092346,
      "epoch": 0.15044001852709588,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001421017455868423,
      "kl": 0.0012700442457571626,
      "learning_rate": 9.699212598425197e-07,
      "loss": 0.0001,
      "num_tokens": 89461589.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3248,
      "step_time": 16.179645586758852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 137.375,
      "completions/mean_terminated_length": 137.375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.22720123454928398,
      "epoch": 0.15048633626679017,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002331890631467104,
      "kl": 0.0015017892292235047,
      "learning_rate": 9.699119962945808e-07,
      "loss": 0.0001,
      "num_tokens": 89481275.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3249,
      "step_time": 14.21345742419362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 178.9375,
      "completions/mean_terminated_length": 178.9375,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.12625334411859512,
      "epoch": 0.1505326540064845,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016706647584214807,
      "kl": 0.001029629958793521,
      "learning_rate": 9.69902732746642e-07,
      "loss": 0.0001,
      "num_tokens": 89509418.0,
      "reward": 0.3425188660621643,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3425188660621643,
      "rewards/reward_func/std": 0.0,
      "step": 3250,
      "step_time": 18.146559350192547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 131.8125,
      "completions/mean_terminated_length": 131.8125,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.1958547979593277,
      "epoch": 0.1505789717461788,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005485287867486477,
      "kl": 0.002891829004511237,
      "learning_rate": 9.69893469198703e-07,
      "loss": 0.0001,
      "num_tokens": 89533063.0,
      "reward": 0.09697196632623672,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.09697196632623672,
      "rewards/reward_func/std": 0.0,
      "step": 3251,
      "step_time": 15.232710171490908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 143.125,
      "completions/mean_terminated_length": 143.125,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.2940206602215767,
      "epoch": 0.1506252894858731,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020695466082543135,
      "kl": 0.0013882204075343907,
      "learning_rate": 9.698842056507642e-07,
      "loss": 0.0001,
      "num_tokens": 89556281.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3252,
      "step_time": 16.643860213458538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 194.9375,
      "completions/mean_terminated_length": 194.9375,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.34039029479026794,
      "epoch": 0.15067160722556738,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015669554704800248,
      "kl": 0.0017213832470588386,
      "learning_rate": 9.698749421028253e-07,
      "loss": 0.0001,
      "num_tokens": 89585144.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3253,
      "step_time": 20.878741156309843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 129.0625,
      "completions/mean_terminated_length": 129.0625,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.29765769094228745,
      "epoch": 0.1507179249652617,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023812558501958847,
      "kl": 0.0018762968538794667,
      "learning_rate": 9.698656785548864e-07,
      "loss": 0.0001,
      "num_tokens": 89613801.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3254,
      "step_time": 16.61073511093855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 156.9375,
      "completions/mean_terminated_length": 156.9375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.31266072392463684,
      "epoch": 0.150764242704956,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004924772307276726,
      "kl": 0.0032038383651524782,
      "learning_rate": 9.698564150069476e-07,
      "loss": 0.0002,
      "num_tokens": 89636600.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3255,
      "step_time": 18.050747897475958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 194.25,
      "completions/mean_terminated_length": 194.25,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3691105544567108,
      "epoch": 0.1508105604446503,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11865503340959549,
      "kl": 0.011823933571577072,
      "learning_rate": 9.698471514590087e-07,
      "loss": -0.073,
      "num_tokens": 89660844.0,
      "reward": 0.1875,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.1875,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 3256,
      "step_time": 21.661863792687654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 144.3125,
      "completions/mean_terminated_length": 144.3125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.33116084337234497,
      "epoch": 0.1508568781843446,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002101058140397072,
      "kl": 0.0017146758327726275,
      "learning_rate": 9.698378879110698e-07,
      "loss": 0.0001,
      "num_tokens": 89683569.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3257,
      "step_time": 15.987027879804373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 164.1875,
      "completions/mean_terminated_length": 164.1875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3729643002152443,
      "epoch": 0.15090319592403892,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003417692380025983,
      "kl": 0.0027359239174984396,
      "learning_rate": 9.698286243631312e-07,
      "loss": 0.0001,
      "num_tokens": 89704676.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3258,
      "step_time": 18.430900812149048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 158.25,
      "completions/mean_terminated_length": 158.25,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.36683958023786545,
      "epoch": 0.15094951366373321,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017151250503957272,
      "kl": 0.0015917146811261773,
      "learning_rate": 9.698193608151923e-07,
      "loss": 0.0001,
      "num_tokens": 89739240.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3259,
      "step_time": 19.48437863588333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 127.625,
      "completions/mean_terminated_length": 127.625,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2848173752427101,
      "epoch": 0.1509958314034275,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004380743019282818,
      "kl": 0.0024106700730044395,
      "learning_rate": 9.698100972672532e-07,
      "loss": 0.0001,
      "num_tokens": 89761298.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3260,
      "step_time": 15.09330853447318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 125.125,
      "completions/mean_terminated_length": 125.125,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.26965858042240143,
      "epoch": 0.1510421491431218,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00806054100394249,
      "kl": 0.0024616795708425343,
      "learning_rate": 9.698008337193143e-07,
      "loss": 0.0001,
      "num_tokens": 89780644.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3261,
      "step_time": 13.78364922106266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 183.0625,
      "completions/mean_terminated_length": 183.0625,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.34466879814863205,
      "epoch": 0.15108846688281613,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1152619943022728,
      "kl": 0.006931161391548812,
      "learning_rate": 9.697915701713757e-07,
      "loss": 0.0458,
      "num_tokens": 89802357.0,
      "reward": 0.5879805088043213,
      "reward_std": 0.47100555896759033,
      "rewards/reward_func/mean": 0.5879805088043213,
      "rewards/reward_func/std": 0.47100555896759033,
      "step": 3262,
      "step_time": 20.866017419844866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 306.0,
      "completions/max_terminated_length": 306.0,
      "completions/mean_length": 277.875,
      "completions/mean_terminated_length": 277.875,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "entropy": 0.2506040036678314,
      "epoch": 0.15113478462251043,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002137587871402502,
      "kl": 0.0019003933412022889,
      "learning_rate": 9.697823066234368e-07,
      "loss": 0.0001,
      "num_tokens": 89827651.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3263,
      "step_time": 26.01487050577998
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 157.5,
      "completions/mean_terminated_length": 157.5,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.36395641416311264,
      "epoch": 0.15118110236220472,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025934490840882063,
      "kl": 0.0018593795248307288,
      "learning_rate": 9.69773043075498e-07,
      "loss": 0.0001,
      "num_tokens": 89861851.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3264,
      "step_time": 19.28165887668729
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 198.75,
      "completions/mean_terminated_length": 198.75,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.1678924560546875,
      "epoch": 0.15122742010189902,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014214112889021635,
      "kl": 0.0010792575048981234,
      "learning_rate": 9.69763779527559e-07,
      "loss": 0.0001,
      "num_tokens": 89884263.0,
      "reward": 0.22313016653060913,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.22313016653060913,
      "rewards/reward_func/std": 0.0,
      "step": 3265,
      "step_time": 20.242999769747257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 220.0625,
      "completions/mean_terminated_length": 220.0625,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.2644739933311939,
      "epoch": 0.15127373784159334,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10686413943767548,
      "kl": 0.005656387540511787,
      "learning_rate": 9.697545159796202e-07,
      "loss": 0.025,
      "num_tokens": 89909336.0,
      "reward": 0.8512634634971619,
      "reward_std": 0.09077872335910797,
      "rewards/reward_func/mean": 0.8512634634971619,
      "rewards/reward_func/std": 0.09077871590852737,
      "step": 3266,
      "step_time": 21.561144541949034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 130.75,
      "completions/mean_terminated_length": 130.75,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.23788397759199142,
      "epoch": 0.15132005558128764,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024266827385872602,
      "kl": 0.001750052470015362,
      "learning_rate": 9.697452524316813e-07,
      "loss": 0.0001,
      "num_tokens": 89928964.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3267,
      "step_time": 13.8215871155262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 223.1875,
      "completions/mean_terminated_length": 223.1875,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.15739285573363304,
      "epoch": 0.15136637332098193,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08341161161661148,
      "kl": 0.0024393564090132713,
      "learning_rate": 9.697359888837424e-07,
      "loss": -0.0684,
      "num_tokens": 89960231.0,
      "reward": 0.9670228958129883,
      "reward_std": 0.1319083571434021,
      "rewards/reward_func/mean": 0.9670228958129883,
      "rewards/reward_func/std": 0.1319083571434021,
      "step": 3268,
      "step_time": 24.692576456815004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 123.1875,
      "completions/mean_terminated_length": 123.1875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.30562247335910797,
      "epoch": 0.15141269106067623,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035384935326874256,
      "kl": 0.0020598951377905905,
      "learning_rate": 9.697267253358035e-07,
      "loss": 0.0001,
      "num_tokens": 89979770.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3269,
      "step_time": 15.709949240088463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 153.0,
      "completions/mean_terminated_length": 153.0,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.15506519004702568,
      "epoch": 0.15145900880037055,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2474597692489624,
      "kl": 0.03511208947747946,
      "learning_rate": 9.697174617878647e-07,
      "loss": -0.0631,
      "num_tokens": 90000378.0,
      "reward": 0.9016326665878296,
      "reward_std": 0.1759648323059082,
      "rewards/reward_func/mean": 0.9016326665878296,
      "rewards/reward_func/std": 0.1759648323059082,
      "step": 3270,
      "step_time": 15.967582158744335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 118.0625,
      "completions/mean_terminated_length": 118.0625,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.3097205013036728,
      "epoch": 0.15150532654006485,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005242596846073866,
      "kl": 0.002366037108004093,
      "learning_rate": 9.69708198239926e-07,
      "loss": 0.0001,
      "num_tokens": 90024683.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3271,
      "step_time": 13.729232251644135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 114.875,
      "completions/mean_terminated_length": 114.875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.30227434635162354,
      "epoch": 0.15155164427975915,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036587256472557783,
      "kl": 0.0023751078406348825,
      "learning_rate": 9.69698934691987e-07,
      "loss": 0.0001,
      "num_tokens": 90048393.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3272,
      "step_time": 13.636955201625824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 186.6875,
      "completions/mean_terminated_length": 186.6875,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.39632876217365265,
      "epoch": 0.15159796201945344,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004219031892716885,
      "kl": 0.003482925007119775,
      "learning_rate": 9.69689671144048e-07,
      "loss": 0.0002,
      "num_tokens": 90077076.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3273,
      "step_time": 21.402074065059423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 179.25,
      "completions/mean_terminated_length": 179.25,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.3429599553346634,
      "epoch": 0.15164427975914777,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10947351902723312,
      "kl": 0.0089666866697371,
      "learning_rate": 9.696804075961092e-07,
      "loss": 0.0115,
      "num_tokens": 90101240.0,
      "reward": 0.028005223721265793,
      "reward_std": 0.025508280843496323,
      "rewards/reward_func/mean": 0.028005223721265793,
      "rewards/reward_func/std": 0.025508280843496323,
      "step": 3274,
      "step_time": 19.090307485312223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 124.0,
      "completions/mean_terminated_length": 124.0,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.33106181770563126,
      "epoch": 0.15169059749884206,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00574125163257122,
      "kl": 0.0031234038760885596,
      "learning_rate": 9.696711440481705e-07,
      "loss": 0.0002,
      "num_tokens": 90123464.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3275,
      "step_time": 15.320256788283587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 210.0625,
      "completions/mean_terminated_length": 210.0625,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.2225816547870636,
      "epoch": 0.15173691523853636,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031187862623482943,
      "kl": 0.002612276584841311,
      "learning_rate": 9.696618805002316e-07,
      "loss": 0.0001,
      "num_tokens": 90156361.0,
      "reward": 0.5961628556251526,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5961628556251526,
      "rewards/reward_func/std": 0.0,
      "step": 3276,
      "step_time": 22.935048822313547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 130.625,
      "completions/mean_terminated_length": 130.625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.20432621985673904,
      "epoch": 0.15178323297823065,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00388956256210804,
      "kl": 0.00211767345899716,
      "learning_rate": 9.696526169522927e-07,
      "loss": 0.0001,
      "num_tokens": 90178755.0,
      "reward": 0.3678794503211975,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3678794503211975,
      "rewards/reward_func/std": 0.0,
      "step": 3277,
      "step_time": 14.698649179190397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 151.125,
      "completions/mean_terminated_length": 151.125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.2996611073613167,
      "epoch": 0.15182955071792498,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018953023245558143,
      "kl": 0.0019482370116747916,
      "learning_rate": 9.696433534043539e-07,
      "loss": 0.0001,
      "num_tokens": 90199861.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3278,
      "step_time": 16.812793001532555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 320.0,
      "completions/max_terminated_length": 320.0,
      "completions/mean_length": 260.9375,
      "completions/mean_terminated_length": 260.9375,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "entropy": 0.23603739216923714,
      "epoch": 0.15187586845761927,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09444378316402435,
      "kl": 0.011439332040026784,
      "learning_rate": 9.69634089856415e-07,
      "loss": -0.0012,
      "num_tokens": 90229716.0,
      "reward": 0.904184103012085,
      "reward_std": 0.0697307363152504,
      "rewards/reward_func/mean": 0.904184103012085,
      "rewards/reward_func/std": 0.0697307288646698,
      "step": 3279,
      "step_time": 27.659960947930813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 145.75,
      "completions/mean_terminated_length": 145.75,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.21466870605945587,
      "epoch": 0.15192218619731357,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035024394746869802,
      "kl": 0.001631989172892645,
      "learning_rate": 9.696248263084761e-07,
      "loss": 0.0001,
      "num_tokens": 90249664.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3280,
      "step_time": 15.02141348272562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 156.25,
      "completions/mean_terminated_length": 156.25,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.20678676292300224,
      "epoch": 0.15196850393700786,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002799753565341234,
      "kl": 0.0014430058363359421,
      "learning_rate": 9.696155627605372e-07,
      "loss": 0.0001,
      "num_tokens": 90280148.0,
      "reward": 0.16957512497901917,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.16957512497901917,
      "rewards/reward_func/std": 0.0,
      "step": 3281,
      "step_time": 20.30757163465023
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 155.1875,
      "completions/mean_terminated_length": 155.1875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.38377776741981506,
      "epoch": 0.1520148216767022,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0056123570539057255,
      "kl": 0.0035017322516068816,
      "learning_rate": 9.696062992125984e-07,
      "loss": 0.0002,
      "num_tokens": 90316999.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3282,
      "step_time": 22.22096023708582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 162.3125,
      "completions/mean_terminated_length": 162.3125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4039725139737129,
      "epoch": 0.15206113941639648,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027582065667957067,
      "kl": 0.0021585662325378507,
      "learning_rate": 9.695970356646595e-07,
      "loss": 0.0001,
      "num_tokens": 90344092.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3283,
      "step_time": 17.36678833886981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 147.5625,
      "completions/mean_terminated_length": 147.5625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3137865886092186,
      "epoch": 0.15210745715609078,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0040509337559342384,
      "kl": 0.0022408179938793182,
      "learning_rate": 9.695877721167206e-07,
      "loss": 0.0001,
      "num_tokens": 90365445.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3284,
      "step_time": 15.23472861200571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 186.75,
      "completions/mean_terminated_length": 186.75,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.24157676845788956,
      "epoch": 0.15215377489578508,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09288583695888519,
      "kl": 0.0019232777995057404,
      "learning_rate": 9.695785085687817e-07,
      "loss": -0.0029,
      "num_tokens": 90387793.0,
      "reward": 0.9493370056152344,
      "reward_std": 0.013510131277143955,
      "rewards/reward_func/mean": 0.9493370056152344,
      "rewards/reward_func/std": 0.013510138727724552,
      "step": 3285,
      "step_time": 18.731104020029306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 189.9375,
      "completions/mean_terminated_length": 189.9375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.3699192479252815,
      "epoch": 0.1522000926354794,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018917883280664682,
      "kl": 0.0021559084416367114,
      "learning_rate": 9.695692450208429e-07,
      "loss": 0.0001,
      "num_tokens": 90418464.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3286,
      "step_time": 21.72200494259596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 156.25,
      "completions/mean_terminated_length": 156.25,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.17947082966566086,
      "epoch": 0.1522464103751737,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14477550983428955,
      "kl": 0.0419952217489481,
      "learning_rate": 9.69559981472904e-07,
      "loss": -0.0652,
      "num_tokens": 90440436.0,
      "reward": 0.8929705619812012,
      "reward_std": 0.19395191967487335,
      "rewards/reward_func/mean": 0.8929705619812012,
      "rewards/reward_func/std": 0.19395191967487335,
      "step": 3287,
      "step_time": 16.9317224919796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 152.3125,
      "completions/mean_terminated_length": 152.3125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.25229161977767944,
      "epoch": 0.152292728114868,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14486804604530334,
      "kl": 0.0030481035355478525,
      "learning_rate": 9.695507179249653e-07,
      "loss": 0.0193,
      "num_tokens": 90460969.0,
      "reward": 0.862541675567627,
      "reward_std": 0.23001109063625336,
      "rewards/reward_func/mean": 0.862541675567627,
      "rewards/reward_func/std": 0.23001112043857574,
      "step": 3288,
      "step_time": 16.284332536160946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 168.5,
      "completions/mean_terminated_length": 168.5,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.370268777012825,
      "epoch": 0.1523390458545623,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008434325456619263,
      "kl": 0.004034133744426072,
      "learning_rate": 9.695414543770265e-07,
      "loss": 0.0002,
      "num_tokens": 90485393.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3289,
      "step_time": 19.006528720259666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 175.3125,
      "completions/mean_terminated_length": 175.3125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.23728742450475693,
      "epoch": 0.1523853635942566,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019843794871121645,
      "kl": 0.0016110965516418219,
      "learning_rate": 9.695321908290876e-07,
      "loss": 0.0001,
      "num_tokens": 90507926.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3290,
      "step_time": 18.091393880546093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 205.875,
      "completions/mean_terminated_length": 205.875,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.3134409636259079,
      "epoch": 0.1524316813339509,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12602339684963226,
      "kl": 0.007899310905486345,
      "learning_rate": 9.695229272811485e-07,
      "loss": 0.0273,
      "num_tokens": 90530164.0,
      "reward": 0.9767484664916992,
      "reward_std": 0.024014079943299294,
      "rewards/reward_func/mean": 0.9767484664916992,
      "rewards/reward_func/std": 0.024014081805944443,
      "step": 3291,
      "step_time": 21.685513395816088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 188.5,
      "completions/mean_terminated_length": 188.5,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.43275701254606247,
      "epoch": 0.1524779990736452,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006604908034205437,
      "kl": 0.003985709452535957,
      "learning_rate": 9.695136637332098e-07,
      "loss": 0.0002,
      "num_tokens": 90568556.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3292,
      "step_time": 24.167351059615612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 168.5,
      "completions/mean_terminated_length": 168.5,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.35880517959594727,
      "epoch": 0.1525243168133395,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0072023323737084866,
      "kl": 0.004372330091428012,
      "learning_rate": 9.69504400185271e-07,
      "loss": 0.0002,
      "num_tokens": 90618884.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3293,
      "step_time": 25.151807714253664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 148.5,
      "completions/mean_terminated_length": 148.5,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.15271759033203125,
      "epoch": 0.15257063455303382,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001755103818140924,
      "kl": 0.0013450997066684067,
      "learning_rate": 9.69495136637332e-07,
      "loss": 0.0001,
      "num_tokens": 90649212.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3294,
      "step_time": 17.78191427141428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 124.0625,
      "completions/mean_terminated_length": 124.0625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2932748645544052,
      "epoch": 0.15261695229272812,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003972919657826424,
      "kl": 0.0026993892388418317,
      "learning_rate": 9.694858730893932e-07,
      "loss": 0.0001,
      "num_tokens": 90669405.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3295,
      "step_time": 14.937408167868853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 171.8125,
      "completions/mean_terminated_length": 171.8125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.35521989315748215,
      "epoch": 0.15266327003242242,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015062614344060421,
      "kl": 0.01492274587508291,
      "learning_rate": 9.694766095414543e-07,
      "loss": 0.0008,
      "num_tokens": 90690890.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3296,
      "step_time": 22.135592482984066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 387.0,
      "completions/max_terminated_length": 387.0,
      "completions/mean_length": 222.125,
      "completions/mean_terminated_length": 222.125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.3670378103852272,
      "epoch": 0.1527095877721167,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11439114063978195,
      "kl": 0.0034319759579375386,
      "learning_rate": 9.694673459935155e-07,
      "loss": 0.0026,
      "num_tokens": 90728652.0,
      "reward": 0.05706879496574402,
      "reward_std": 0.22827517986297607,
      "rewards/reward_func/mean": 0.05706879496574402,
      "rewards/reward_func/std": 0.22827517986297607,
      "step": 3297,
      "step_time": 34.143172804266214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 154.0,
      "completions/mean_terminated_length": 154.0,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.32381830364465714,
      "epoch": 0.15275590551181104,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006391808856278658,
      "kl": 0.00352169742109254,
      "learning_rate": 9.694580824455766e-07,
      "loss": 0.0002,
      "num_tokens": 90749084.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3298,
      "step_time": 16.812274515628815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 135.625,
      "completions/mean_terminated_length": 135.625,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2904447764158249,
      "epoch": 0.15280222325150533,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005151164252310991,
      "kl": 0.0025714096846058965,
      "learning_rate": 9.694488188976377e-07,
      "loss": 0.0001,
      "num_tokens": 90771702.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3299,
      "step_time": 15.857111155986786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 200.625,
      "completions/mean_terminated_length": 200.625,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "entropy": 0.19473102688789368,
      "epoch": 0.15284854099119963,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1317998468875885,
      "kl": 0.00478421151638031,
      "learning_rate": 9.694395553496988e-07,
      "loss": -0.0416,
      "num_tokens": 90795616.0,
      "reward": 0.09923243522644043,
      "reward_std": 0.012309839949011803,
      "rewards/reward_func/mean": 0.09923243522644043,
      "rewards/reward_func/std": 0.012309840880334377,
      "step": 3300,
      "step_time": 20.982359372079372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 128.875,
      "completions/mean_terminated_length": 128.875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.26331282407045364,
      "epoch": 0.15289485873089392,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022739721462130547,
      "kl": 0.001985971728572622,
      "learning_rate": 9.694302918017602e-07,
      "loss": 0.0001,
      "num_tokens": 90815454.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3301,
      "step_time": 14.306706611067057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 145.125,
      "completions/mean_terminated_length": 145.125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.23532408848404884,
      "epoch": 0.15294117647058825,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006874697748571634,
      "kl": 0.0036359603982418776,
      "learning_rate": 9.694210282538213e-07,
      "loss": 0.0002,
      "num_tokens": 90838928.0,
      "reward": 0.8633400201797485,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8633400201797485,
      "rewards/reward_func/std": 0.0,
      "step": 3302,
      "step_time": 15.767543531954288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 174.875,
      "completions/mean_terminated_length": 174.875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.4035678803920746,
      "epoch": 0.15298749421028254,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030451833736151457,
      "kl": 0.003345791425090283,
      "learning_rate": 9.694117647058822e-07,
      "loss": 0.0002,
      "num_tokens": 90873550.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3303,
      "step_time": 20.6306994818151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 123.5,
      "completions/mean_terminated_length": 123.5,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.28306350857019424,
      "epoch": 0.15303381194997684,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00297046871855855,
      "kl": 0.002029210823820904,
      "learning_rate": 9.694025011579433e-07,
      "loss": 0.0001,
      "num_tokens": 90898550.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3304,
      "step_time": 14.078998416662216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 158.6875,
      "completions/mean_terminated_length": 158.6875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.377525232732296,
      "epoch": 0.15308012968967113,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00698324479162693,
      "kl": 0.002947824017610401,
      "learning_rate": 9.693932376100047e-07,
      "loss": 0.0001,
      "num_tokens": 90953521.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3305,
      "step_time": 24.537496775388718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 126.125,
      "completions/mean_terminated_length": 126.125,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.30835408717393875,
      "epoch": 0.15312644742936546,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009671625681221485,
      "kl": 0.004571863682940602,
      "learning_rate": 9.693839740620658e-07,
      "loss": 0.0002,
      "num_tokens": 90973779.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3306,
      "step_time": 13.879542093724012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 124.375,
      "completions/mean_terminated_length": 124.375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.301509328186512,
      "epoch": 0.15317276516905975,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017688415246084332,
      "kl": 0.001505364547483623,
      "learning_rate": 9.69374710514127e-07,
      "loss": 0.0001,
      "num_tokens": 91001609.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3307,
      "step_time": 15.010881319642067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 193.125,
      "completions/mean_terminated_length": 193.125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3417460024356842,
      "epoch": 0.15321908290875405,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011428617872297764,
      "kl": 0.006048428127542138,
      "learning_rate": 9.69365446966188e-07,
      "loss": 0.0003,
      "num_tokens": 91028219.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3308,
      "step_time": 21.835330188274384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 269.0,
      "completions/max_terminated_length": 269.0,
      "completions/mean_length": 229.9375,
      "completions/mean_terminated_length": 229.9375,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.21667520701885223,
      "epoch": 0.15326540064844835,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07177912443876266,
      "kl": 0.004559624299872667,
      "learning_rate": 9.693561834182492e-07,
      "loss": -0.0447,
      "num_tokens": 91052026.0,
      "reward": 0.013111528009176254,
      "reward_std": 0.004381328821182251,
      "rewards/reward_func/mean": 0.013111528009176254,
      "rewards/reward_func/std": 0.004381329286843538,
      "step": 3309,
      "step_time": 23.080633092671633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 150.0625,
      "completions/mean_terminated_length": 150.0625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.1849103607237339,
      "epoch": 0.15331171838814267,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003997376654297113,
      "kl": 0.0022924855584278703,
      "learning_rate": 9.693469198703103e-07,
      "loss": 0.0001,
      "num_tokens": 91072955.0,
      "reward": 0.07200431823730469,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.07200431823730469,
      "rewards/reward_func/std": 0.0,
      "step": 3310,
      "step_time": 16.4729915112257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 209.4375,
      "completions/mean_terminated_length": 209.4375,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.3776468113064766,
      "epoch": 0.15335803612783697,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1089252382516861,
      "kl": 0.009051568806171417,
      "learning_rate": 9.693376563223714e-07,
      "loss": -0.1148,
      "num_tokens": 91101362.0,
      "reward": 0.1875,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.1875,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 3311,
      "step_time": 24.408411759883165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 192.0625,
      "completions/mean_terminated_length": 192.0625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.40296033024787903,
      "epoch": 0.15340435386753126,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12542779743671417,
      "kl": 0.006457364303059876,
      "learning_rate": 9.693283927744325e-07,
      "loss": -0.0625,
      "num_tokens": 91122739.0,
      "reward": 0.2542293667793274,
      "reward_std": 0.3926730751991272,
      "rewards/reward_func/mean": 0.2542293667793274,
      "rewards/reward_func/std": 0.3926730453968048,
      "step": 3312,
      "step_time": 20.91408522054553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 183.5,
      "completions/mean_terminated_length": 183.5,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.11455479264259338,
      "epoch": 0.15345067160722556,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020320117473602295,
      "kl": 0.0009717029897728935,
      "learning_rate": 9.693191292264937e-07,
      "loss": 0.0,
      "num_tokens": 91152939.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 3313,
      "step_time": 19.73755782842636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 300.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 241.8125,
      "completions/mean_terminated_length": 241.8125,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.30763477832078934,
      "epoch": 0.15349698934691988,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07847141474485397,
      "kl": 0.004074239986948669,
      "learning_rate": 9.693098656785548e-07,
      "loss": -0.0533,
      "num_tokens": 91191896.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 3314,
      "step_time": 28.239569298923016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 186.4375,
      "completions/mean_terminated_length": 186.4375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.3589971140027046,
      "epoch": 0.15354330708661418,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10690788179636002,
      "kl": 0.00506272166967392,
      "learning_rate": 9.69300602130616e-07,
      "loss": -0.0265,
      "num_tokens": 91213151.0,
      "reward": 0.7459019422531128,
      "reward_std": 0.2996750473976135,
      "rewards/reward_func/mean": 0.7459019422531128,
      "rewards/reward_func/std": 0.2996750771999359,
      "step": 3315,
      "step_time": 21.345596093684435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 139.375,
      "completions/mean_terminated_length": 139.375,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.20879939943552017,
      "epoch": 0.15358962482630847,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035100355744361877,
      "kl": 0.0022340710274875164,
      "learning_rate": 9.69291338582677e-07,
      "loss": 0.0001,
      "num_tokens": 91232821.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3316,
      "step_time": 14.858672991394997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 121.9375,
      "completions/mean_terminated_length": 121.9375,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.26160337403416634,
      "epoch": 0.15363594256600277,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002126466017216444,
      "kl": 0.0017897105135489255,
      "learning_rate": 9.692820750347382e-07,
      "loss": 0.0001,
      "num_tokens": 91256756.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3317,
      "step_time": 13.929788701236248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 208.125,
      "completions/mean_terminated_length": 208.125,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.3657693639397621,
      "epoch": 0.1536822603056971,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008878033608198166,
      "kl": 0.0043170658173039556,
      "learning_rate": 9.692728114867995e-07,
      "loss": 0.0002,
      "num_tokens": 91289782.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3318,
      "step_time": 24.704879105091095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 122.0625,
      "completions/mean_terminated_length": 122.0625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2652072086930275,
      "epoch": 0.1537285780453914,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001954125240445137,
      "kl": 0.0016342448652721941,
      "learning_rate": 9.692635479388606e-07,
      "loss": 0.0001,
      "num_tokens": 91309847.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3319,
      "step_time": 13.324027381837368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 122.5625,
      "completions/mean_terminated_length": 122.5625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.24706030637025833,
      "epoch": 0.15377489578508569,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005342193879187107,
      "kl": 0.0023952096817083657,
      "learning_rate": 9.692542843909217e-07,
      "loss": 0.0001,
      "num_tokens": 91332304.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3320,
      "step_time": 14.820265386253595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 192.3125,
      "completions/mean_terminated_length": 192.3125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.4223659113049507,
      "epoch": 0.15382121352477998,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09434162825345993,
      "kl": 0.005028395331464708,
      "learning_rate": 9.692450208429827e-07,
      "loss": -0.0106,
      "num_tokens": 91358965.0,
      "reward": 0.11413758993148804,
      "reward_std": 0.31188327074050903,
      "rewards/reward_func/mean": 0.11413758993148804,
      "rewards/reward_func/std": 0.31188327074050903,
      "step": 3321,
      "step_time": 22.400970544666052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 168.25,
      "completions/mean_terminated_length": 168.25,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.3815314769744873,
      "epoch": 0.1538675312644743,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002653124975040555,
      "kl": 0.002448896935675293,
      "learning_rate": 9.69235757295044e-07,
      "loss": 0.0001,
      "num_tokens": 91393145.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3322,
      "step_time": 20.7161478176713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 164.4375,
      "completions/mean_terminated_length": 164.4375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.18568189442157745,
      "epoch": 0.1539138490041686,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013142157113179564,
      "kl": 0.0011784150556195527,
      "learning_rate": 9.692264937471051e-07,
      "loss": 0.0001,
      "num_tokens": 91419232.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 3323,
      "step_time": 17.862348187714815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 143.5,
      "completions/mean_terminated_length": 143.5,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.3101409822702408,
      "epoch": 0.1539601667438629,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006444879807531834,
      "kl": 0.003332747903186828,
      "learning_rate": 9.692172301991662e-07,
      "loss": 0.0002,
      "num_tokens": 91455528.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3324,
      "step_time": 19.23028664290905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 163.25,
      "completions/mean_terminated_length": 163.25,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.36022551357746124,
      "epoch": 0.1540064844835572,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030863964930176735,
      "kl": 0.002290126052685082,
      "learning_rate": 9.692079666512274e-07,
      "loss": 0.0001,
      "num_tokens": 91478396.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3325,
      "step_time": 17.516544554382563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 135.5,
      "completions/mean_terminated_length": 135.5,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.3505780100822449,
      "epoch": 0.15405280222325152,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032824836671352386,
      "kl": 0.002519895788282156,
      "learning_rate": 9.691987031032885e-07,
      "loss": 0.0001,
      "num_tokens": 91502740.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3326,
      "step_time": 15.37037943303585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 158.3125,
      "completions/mean_terminated_length": 158.3125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.20433121547102928,
      "epoch": 0.1540991199629458,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11700944602489471,
      "kl": 0.0021943735773675144,
      "learning_rate": 9.691894395553496e-07,
      "loss": -0.0051,
      "num_tokens": 91530889.0,
      "reward": 0.24393409490585327,
      "reward_std": 0.036680784076452255,
      "rewards/reward_func/mean": 0.24393409490585327,
      "rewards/reward_func/std": 0.036680784076452255,
      "step": 3327,
      "step_time": 18.378038570284843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 126.0,
      "completions/max_terminated_length": 126.0,
      "completions/mean_length": 107.375,
      "completions/mean_terminated_length": 107.375,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.2943278029561043,
      "epoch": 0.1541454377026401,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014756934251636267,
      "kl": 0.0014031844912096858,
      "learning_rate": 9.691801760074107e-07,
      "loss": 0.0001,
      "num_tokens": 91551919.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3328,
      "step_time": 12.465799763798714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 170.75,
      "completions/mean_terminated_length": 170.75,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.15385310351848602,
      "epoch": 0.1541917554423344,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08098611235618591,
      "kl": 0.0012751940521411598,
      "learning_rate": 9.691709124594719e-07,
      "loss": -0.0005,
      "num_tokens": 91583403.0,
      "reward": 0.9881047010421753,
      "reward_std": 0.03250421583652496,
      "rewards/reward_func/mean": 0.9881047010421753,
      "rewards/reward_func/std": 0.03250420466065407,
      "step": 3329,
      "step_time": 19.58171332255006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 157.3125,
      "completions/mean_terminated_length": 157.3125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.32820216566324234,
      "epoch": 0.15423807318202873,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007466540671885014,
      "kl": 0.005166945746168494,
      "learning_rate": 9.69161648911533e-07,
      "loss": 0.0003,
      "num_tokens": 91606672.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3330,
      "step_time": 17.079447399824858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 184.625,
      "completions/mean_terminated_length": 184.625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3910089060664177,
      "epoch": 0.15428439092172302,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16182798147201538,
      "kl": 0.010804095072671771,
      "learning_rate": 9.691523853635941e-07,
      "loss": -0.0352,
      "num_tokens": 91636666.0,
      "reward": 0.05592745915055275,
      "reward_std": 0.223709836602211,
      "rewards/reward_func/mean": 0.05592745915055275,
      "rewards/reward_func/std": 0.223709836602211,
      "step": 3331,
      "step_time": 22.886167015880346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 134.5625,
      "completions/mean_terminated_length": 134.5625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.3120100200176239,
      "epoch": 0.15433070866141732,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002287524752318859,
      "kl": 0.002078368212096393,
      "learning_rate": 9.691431218156555e-07,
      "loss": 0.0001,
      "num_tokens": 91656643.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3332,
      "step_time": 14.359949983656406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 158.0625,
      "completions/mean_terminated_length": 158.0625,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.257740281522274,
      "epoch": 0.15437702640111162,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015757882501929998,
      "kl": 0.001483880274463445,
      "learning_rate": 9.691338582677166e-07,
      "loss": 0.0001,
      "num_tokens": 91691124.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 3333,
      "step_time": 21.722475323826075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 144.1875,
      "completions/mean_terminated_length": 144.1875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.24944373592734337,
      "epoch": 0.15442334414080594,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005230751354247332,
      "kl": 0.0025434315903112292,
      "learning_rate": 9.691245947197775e-07,
      "loss": 0.0001,
      "num_tokens": 91710839.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3334,
      "step_time": 18.03704598918557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 191.6875,
      "completions/mean_terminated_length": 191.6875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.21327754482626915,
      "epoch": 0.15446966188050024,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1796390861272812,
      "kl": 0.006803740747272968,
      "learning_rate": 9.691153311718388e-07,
      "loss": -0.0324,
      "num_tokens": 91735986.0,
      "reward": 0.03723296523094177,
      "reward_std": 0.00694130826741457,
      "rewards/reward_func/mean": 0.03723296523094177,
      "rewards/reward_func/std": 0.0069413078017532825,
      "step": 3335,
      "step_time": 19.53165503963828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 151.0,
      "completions/mean_terminated_length": 151.0,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.25464953109622,
      "epoch": 0.15451597962019453,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003500354941934347,
      "kl": 0.002710888395085931,
      "learning_rate": 9.691060676239e-07,
      "loss": 0.0001,
      "num_tokens": 91755970.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3336,
      "step_time": 15.854786850512028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 136.0625,
      "completions/mean_terminated_length": 136.0625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.31221120059490204,
      "epoch": 0.15456229735988883,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001925988937728107,
      "kl": 0.001990779914194718,
      "learning_rate": 9.69096804075961e-07,
      "loss": 0.0001,
      "num_tokens": 91780451.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3337,
      "step_time": 15.589721571654081
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 152.1875,
      "completions/mean_terminated_length": 152.1875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.23425745218992233,
      "epoch": 0.15460861509958315,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.172028586268425,
      "kl": 0.00620538042858243,
      "learning_rate": 9.690875405280222e-07,
      "loss": -0.0734,
      "num_tokens": 91806550.0,
      "reward": 0.8298039436340332,
      "reward_std": 0.24870947003364563,
      "rewards/reward_func/mean": 0.8298039436340332,
      "rewards/reward_func/std": 0.24870947003364563,
      "step": 3338,
      "step_time": 17.276704136282206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 284.0,
      "completions/max_terminated_length": 284.0,
      "completions/mean_length": 177.875,
      "completions/mean_terminated_length": 177.875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.2940092608332634,
      "epoch": 0.15465493283927745,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11028830707073212,
      "kl": 0.008754837443120778,
      "learning_rate": 9.690782769800833e-07,
      "loss": -0.0168,
      "num_tokens": 91828004.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 3339,
      "step_time": 23.81776424124837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 154.75,
      "completions/mean_terminated_length": 154.75,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3867867439985275,
      "epoch": 0.15470125057897174,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004322863183915615,
      "kl": 0.0028193810721859336,
      "learning_rate": 9.690690134321445e-07,
      "loss": 0.0001,
      "num_tokens": 91876096.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3340,
      "step_time": 22.542425610125065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 168.0,
      "completions/mean_terminated_length": 168.0,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.17907460778951645,
      "epoch": 0.15474756831866604,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033703304361552,
      "kl": 0.0020329627150204033,
      "learning_rate": 9.690597498842056e-07,
      "loss": 0.0001,
      "num_tokens": 91899808.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3341,
      "step_time": 16.980496268719435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 160.8125,
      "completions/mean_terminated_length": 160.8125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.36107001453638077,
      "epoch": 0.15479388605836036,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028498759493231773,
      "kl": 0.0025219633826054633,
      "learning_rate": 9.690504863362667e-07,
      "loss": 0.0001,
      "num_tokens": 91947789.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3342,
      "step_time": 23.340466152876616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 193.0625,
      "completions/mean_terminated_length": 193.0625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.35160113871097565,
      "epoch": 0.15484020379805466,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1251196712255478,
      "kl": 0.0075583066791296005,
      "learning_rate": 9.690412227883278e-07,
      "loss": 0.0265,
      "num_tokens": 91980734.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 3343,
      "step_time": 23.79976823925972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 187.5625,
      "completions/mean_terminated_length": 187.5625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.14454735442996025,
      "epoch": 0.15488652153774896,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022063462529331446,
      "kl": 0.001651369733735919,
      "learning_rate": 9.69031959240389e-07,
      "loss": 0.0001,
      "num_tokens": 92018247.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3344,
      "step_time": 21.65063364431262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 137.25,
      "completions/mean_terminated_length": 137.25,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2657321095466614,
      "epoch": 0.15493283927744325,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007140035275369883,
      "kl": 0.003754403966013342,
      "learning_rate": 9.690226956924503e-07,
      "loss": 0.0002,
      "num_tokens": 92038075.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3345,
      "step_time": 14.567701142281294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 143.125,
      "completions/mean_terminated_length": 143.125,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.1490999199450016,
      "epoch": 0.15497915701713758,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1289130598306656,
      "kl": 0.0032111339969560504,
      "learning_rate": 9.690134321445112e-07,
      "loss": -0.0107,
      "num_tokens": 92064477.0,
      "reward": 0.007286164443939924,
      "reward_std": 0.005187314469367266,
      "rewards/reward_func/mean": 0.007286164443939924,
      "rewards/reward_func/std": 0.005187314003705978,
      "step": 3346,
      "step_time": 19.316949263215065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 145.9375,
      "completions/mean_terminated_length": 145.9375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.31513993442058563,
      "epoch": 0.15502547475683187,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01598193123936653,
      "kl": 0.005524818319827318,
      "learning_rate": 9.690041685965723e-07,
      "loss": 0.0003,
      "num_tokens": 92087068.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3347,
      "step_time": 16.297446753829718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 191.3125,
      "completions/mean_terminated_length": 191.3125,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.14111372828483582,
      "epoch": 0.15507179249652617,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004341846331954002,
      "kl": 0.0023714601702522486,
      "learning_rate": 9.689949050486337e-07,
      "loss": 0.0001,
      "num_tokens": 92108785.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3348,
      "step_time": 18.504158172756433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 140.375,
      "completions/mean_terminated_length": 140.375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3126991540193558,
      "epoch": 0.15511811023622046,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004649960435926914,
      "kl": 0.0031018926529213786,
      "learning_rate": 9.689856415006948e-07,
      "loss": 0.0002,
      "num_tokens": 92133815.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3349,
      "step_time": 16.528464261442423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 297.0,
      "completions/max_terminated_length": 297.0,
      "completions/mean_length": 216.6875,
      "completions/mean_terminated_length": 216.6875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.3769342973828316,
      "epoch": 0.1551644279759148,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11284460872411728,
      "kl": 0.013480915687978268,
      "learning_rate": 9.68976377952756e-07,
      "loss": -0.0919,
      "num_tokens": 92164290.0,
      "reward": 0.0625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.0625,
      "rewards/reward_func/std": 0.25,
      "step": 3350,
      "step_time": 26.06608099862933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 123.0,
      "completions/max_terminated_length": 123.0,
      "completions/mean_length": 113.4375,
      "completions/mean_terminated_length": 113.4375,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.26655734330415726,
      "epoch": 0.15521074571560908,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002856267848983407,
      "kl": 0.0017867960850708187,
      "learning_rate": 9.68967114404817e-07,
      "loss": 0.0001,
      "num_tokens": 92183737.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3351,
      "step_time": 12.166086815297604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 304.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 218.125,
      "completions/mean_terminated_length": 218.125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.3527267277240753,
      "epoch": 0.15525706345530338,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1178298145532608,
      "kl": 0.021648069377988577,
      "learning_rate": 9.689578508568782e-07,
      "loss": -0.1421,
      "num_tokens": 92208651.0,
      "reward": 0.3111305236816406,
      "reward_std": 0.3643587827682495,
      "rewards/reward_func/mean": 0.3111305236816406,
      "rewards/reward_func/std": 0.3643587529659271,
      "step": 3352,
      "step_time": 25.933844342827797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 199.8125,
      "completions/mean_terminated_length": 199.8125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.3849535584449768,
      "epoch": 0.15530338119499768,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10025057196617126,
      "kl": 0.004247734148520976,
      "learning_rate": 9.689485873089393e-07,
      "loss": -0.0031,
      "num_tokens": 92236280.0,
      "reward": 0.10031722486019135,
      "reward_std": 0.06985194981098175,
      "rewards/reward_func/mean": 0.10031722486019135,
      "rewards/reward_func/std": 0.06985194981098175,
      "step": 3353,
      "step_time": 22.336097571998835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 190.0625,
      "completions/mean_terminated_length": 190.0625,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.21971788629889488,
      "epoch": 0.155349698934692,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032453506719321012,
      "kl": 0.0023439573124051094,
      "learning_rate": 9.689393237610004e-07,
      "loss": 0.0001,
      "num_tokens": 92273417.0,
      "reward": 0.9067110419273376,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9067110419273376,
      "rewards/reward_func/std": 0.0,
      "step": 3354,
      "step_time": 21.85452165454626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 172.4375,
      "completions/mean_terminated_length": 172.4375,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.23878859728574753,
      "epoch": 0.1553960166743863,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037063290365040302,
      "kl": 0.003366717486642301,
      "learning_rate": 9.689300602130615e-07,
      "loss": 0.0002,
      "num_tokens": 92298768.0,
      "reward": 0.7187313437461853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7187313437461853,
      "rewards/reward_func/std": 0.0,
      "step": 3355,
      "step_time": 20.31452167034149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 153.0,
      "completions/mean_terminated_length": 153.0,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.24774175509810448,
      "epoch": 0.1554423344140806,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015400012023746967,
      "kl": 0.019052762538194656,
      "learning_rate": 9.689207966651227e-07,
      "loss": 0.0009,
      "num_tokens": 92319456.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3356,
      "step_time": 16.624549858272076
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 170.6875,
      "completions/mean_terminated_length": 170.6875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.22818206995725632,
      "epoch": 0.1554886521537749,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18956658244132996,
      "kl": 0.011238167993724346,
      "learning_rate": 9.689115331171838e-07,
      "loss": 0.1641,
      "num_tokens": 92341003.0,
      "reward": 0.8600144386291504,
      "reward_std": 0.1000090166926384,
      "rewards/reward_func/mean": 0.8600144386291504,
      "rewards/reward_func/std": 0.100009024143219,
      "step": 3357,
      "step_time": 22.292827654629946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 120.875,
      "completions/mean_terminated_length": 120.875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.26849503442645073,
      "epoch": 0.1555349698934692,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002744281431660056,
      "kl": 0.002132963534677401,
      "learning_rate": 9.68902269569245e-07,
      "loss": 0.0001,
      "num_tokens": 92362841.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3358,
      "step_time": 13.914867967367172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 119.75,
      "completions/mean_terminated_length": 119.75,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.227818064391613,
      "epoch": 0.1555812876331635,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036486228927969933,
      "kl": 0.002492701372830197,
      "learning_rate": 9.68893006021306e-07,
      "loss": 0.0001,
      "num_tokens": 92382037.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3359,
      "step_time": 13.963378340005875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 186.5,
      "completions/mean_terminated_length": 186.5,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.3525514081120491,
      "epoch": 0.1556276053728578,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09065493196249008,
      "kl": 0.00845847011078149,
      "learning_rate": 9.688837424733672e-07,
      "loss": 0.0064,
      "num_tokens": 92406925.0,
      "reward": 0.13488443195819855,
      "reward_std": 0.10790754109621048,
      "rewards/reward_func/mean": 0.13488443195819855,
      "rewards/reward_func/std": 0.10790754854679108,
      "step": 3360,
      "step_time": 21.681909650564194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 280.0,
      "completions/max_terminated_length": 280.0,
      "completions/mean_length": 208.625,
      "completions/mean_terminated_length": 208.625,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.2752373516559601,
      "epoch": 0.1556739231125521,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11485565453767776,
      "kl": 0.01365563995204866,
      "learning_rate": 9.688744789254283e-07,
      "loss": -0.082,
      "num_tokens": 92429991.0,
      "reward": 0.3644893765449524,
      "reward_std": 0.2041807323694229,
      "rewards/reward_func/mean": 0.3644893765449524,
      "rewards/reward_func/std": 0.2041807323694229,
      "step": 3361,
      "step_time": 25.30739837139845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 133.25,
      "completions/mean_terminated_length": 133.25,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2848980873823166,
      "epoch": 0.15572024085224642,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031788686756044626,
      "kl": 0.0022560322540812194,
      "learning_rate": 9.688652153774896e-07,
      "loss": 0.0001,
      "num_tokens": 92453291.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3362,
      "step_time": 14.625582505017519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 124.125,
      "completions/mean_terminated_length": 124.125,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.279193215072155,
      "epoch": 0.15576655859194072,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003246487583965063,
      "kl": 0.002011729695368558,
      "learning_rate": 9.688559518295508e-07,
      "loss": 0.0001,
      "num_tokens": 92472845.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3363,
      "step_time": 13.783062070608139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 142.125,
      "completions/mean_terminated_length": 142.125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.17071790620684624,
      "epoch": 0.15581287633163501,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028183585964143276,
      "kl": 0.0019790363148786128,
      "learning_rate": 9.688466882816119e-07,
      "loss": 0.0001,
      "num_tokens": 92501391.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 3364,
      "step_time": 17.18678993731737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 136.625,
      "completions/mean_terminated_length": 136.625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.29390769824385643,
      "epoch": 0.1558591940713293,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022444038186222315,
      "kl": 0.0019568908028304577,
      "learning_rate": 9.68837424733673e-07,
      "loss": 0.0001,
      "num_tokens": 92523801.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3365,
      "step_time": 16.590839076787233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 199.1875,
      "completions/mean_terminated_length": 199.1875,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.3531302735209465,
      "epoch": 0.15590551181102363,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007046143990010023,
      "kl": 0.005330355372279882,
      "learning_rate": 9.688281611857341e-07,
      "loss": 0.0003,
      "num_tokens": 92547644.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3366,
      "step_time": 20.67684406414628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 207.8125,
      "completions/mean_terminated_length": 207.8125,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.17195185646414757,
      "epoch": 0.15595182955071793,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11639894545078278,
      "kl": 0.0011754246370401233,
      "learning_rate": 9.688188976377953e-07,
      "loss": 0.0158,
      "num_tokens": 92592265.0,
      "reward": 0.8732266426086426,
      "reward_std": 0.0017810834106057882,
      "rewards/reward_func/mean": 0.8732266426086426,
      "rewards/reward_func/std": 0.0017810744466260076,
      "step": 3367,
      "step_time": 25.887058943510056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 161.5625,
      "completions/mean_terminated_length": 161.5625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.1910524144768715,
      "epoch": 0.15599814729041223,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002463321899995208,
      "kl": 0.0017776959284674376,
      "learning_rate": 9.688096340898564e-07,
      "loss": 0.0001,
      "num_tokens": 92614322.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 3368,
      "step_time": 16.187005519866943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 140.3125,
      "completions/mean_terminated_length": 140.3125,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.16739752888679504,
      "epoch": 0.15604446503010652,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015062983147799969,
      "kl": 0.0015503333561355248,
      "learning_rate": 9.688003705419175e-07,
      "loss": 0.0001,
      "num_tokens": 92635127.0,
      "reward": 0.894839346408844,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.894839346408844,
      "rewards/reward_func/std": 0.0,
      "step": 3369,
      "step_time": 15.248216327279806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 126.0,
      "completions/max_terminated_length": 126.0,
      "completions/mean_length": 114.25,
      "completions/mean_terminated_length": 114.25,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2350492924451828,
      "epoch": 0.15609078276980085,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003463833360001445,
      "kl": 0.0016782145539764315,
      "learning_rate": 9.687911069939786e-07,
      "loss": 0.0001,
      "num_tokens": 92654411.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3370,
      "step_time": 12.283231306821108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 131.0,
      "completions/mean_length": 121.125,
      "completions/mean_terminated_length": 121.125,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.22284315899014473,
      "epoch": 0.15613710050949514,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016951520228758454,
      "kl": 0.0011517916864249855,
      "learning_rate": 9.687818434460398e-07,
      "loss": 0.0001,
      "num_tokens": 92675917.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3371,
      "step_time": 13.068330138921738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 135.6875,
      "completions/mean_terminated_length": 135.6875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.4059174954891205,
      "epoch": 0.15618341824918944,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024354124907404184,
      "kl": 0.0021466552861966193,
      "learning_rate": 9.687725798981009e-07,
      "loss": 0.0001,
      "num_tokens": 92718568.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3372,
      "step_time": 20.215550310909748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 157.625,
      "completions/mean_terminated_length": 157.625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.2768791392445564,
      "epoch": 0.15622973598888373,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00909844134002924,
      "kl": 0.010500472038984299,
      "learning_rate": 9.68763316350162e-07,
      "loss": 0.0005,
      "num_tokens": 92744754.0,
      "reward": 0.11603700369596481,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11603700369596481,
      "rewards/reward_func/std": 0.0,
      "step": 3373,
      "step_time": 18.666076958179474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 169.1875,
      "completions/mean_terminated_length": 169.1875,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.3718565031886101,
      "epoch": 0.15627605372857806,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00507784727960825,
      "kl": 0.0032935781637206674,
      "learning_rate": 9.687540528022231e-07,
      "loss": 0.0002,
      "num_tokens": 92772725.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3374,
      "step_time": 18.568583589047194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 126.375,
      "completions/mean_terminated_length": 126.375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.29444124549627304,
      "epoch": 0.15632237146827235,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004532559309154749,
      "kl": 0.00210694590350613,
      "learning_rate": 9.687447892542845e-07,
      "loss": 0.0001,
      "num_tokens": 92796075.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3375,
      "step_time": 14.34216309338808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 222.875,
      "completions/mean_terminated_length": 222.875,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.3261827826499939,
      "epoch": 0.15636868920796665,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07907669246196747,
      "kl": 0.006121080252341926,
      "learning_rate": 9.687355257063456e-07,
      "loss": 0.0348,
      "num_tokens": 92824473.0,
      "reward": 0.7797904014587402,
      "reward_std": 0.23554649949073792,
      "rewards/reward_func/mean": 0.7797904014587402,
      "rewards/reward_func/std": 0.23554649949073792,
      "step": 3376,
      "step_time": 24.10123337060213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 133.25,
      "completions/mean_terminated_length": 133.25,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2709270790219307,
      "epoch": 0.15641500694766095,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011515576392412186,
      "kl": 0.00539400038542226,
      "learning_rate": 9.687262621584065e-07,
      "loss": 0.0003,
      "num_tokens": 92844013.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3377,
      "step_time": 14.51206111907959
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 189.5,
      "completions/mean_terminated_length": 189.5,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.1876688115298748,
      "epoch": 0.15646132468735527,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001437133178114891,
      "kl": 0.0011671070824377239,
      "learning_rate": 9.687169986104678e-07,
      "loss": 0.0001,
      "num_tokens": 92876693.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3378,
      "step_time": 22.443644117563963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 172.875,
      "completions/mean_terminated_length": 172.875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.36684370785951614,
      "epoch": 0.15650764242704956,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14057622849941254,
      "kl": 0.010422457475215197,
      "learning_rate": 9.68707735062529e-07,
      "loss": 0.0244,
      "num_tokens": 92900035.0,
      "reward": 0.8276056051254272,
      "reward_std": 0.32421788573265076,
      "rewards/reward_func/mean": 0.8276056051254272,
      "rewards/reward_func/std": 0.32421791553497314,
      "step": 3379,
      "step_time": 20.422037471085787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 207.8125,
      "completions/mean_terminated_length": 207.8125,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.3677055314183235,
      "epoch": 0.15655396016674386,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10390904545783997,
      "kl": 0.0069690506206825376,
      "learning_rate": 9.6869847151459e-07,
      "loss": -0.0244,
      "num_tokens": 92937488.0,
      "reward": 0.7645001411437988,
      "reward_std": 0.3923640847206116,
      "rewards/reward_func/mean": 0.7645001411437988,
      "rewards/reward_func/std": 0.39236411452293396,
      "step": 3380,
      "step_time": 24.310496348887682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 175.4375,
      "completions/mean_terminated_length": 175.4375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.37994876503944397,
      "epoch": 0.15660027790643816,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014101024717092514,
      "kl": 0.0016320306458510458,
      "learning_rate": 9.686892079666512e-07,
      "loss": 0.0001,
      "num_tokens": 92968039.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3381,
      "step_time": 20.314433723688126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 149.1875,
      "completions/mean_terminated_length": 149.1875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.42657849192619324,
      "epoch": 0.15664659564613248,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002053258242085576,
      "kl": 0.002325157751329243,
      "learning_rate": 9.686799444187123e-07,
      "loss": 0.0001,
      "num_tokens": 93022938.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3382,
      "step_time": 25.133915796875954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 164.9375,
      "completions/mean_terminated_length": 164.9375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.23933277279138565,
      "epoch": 0.15669291338582678,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10873263329267502,
      "kl": 0.005963696981780231,
      "learning_rate": 9.686706808707735e-07,
      "loss": 0.0141,
      "num_tokens": 93055401.0,
      "reward": 0.8300195336341858,
      "reward_std": 0.0028690295293927193,
      "rewards/reward_func/mean": 0.8300195336341858,
      "rewards/reward_func/std": 0.0028690400067716837,
      "step": 3383,
      "step_time": 19.60477663949132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 151.0,
      "completions/mean_terminated_length": 151.0,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.3595099225640297,
      "epoch": 0.15673923112552107,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030787335708737373,
      "kl": 0.0030947758932597935,
      "learning_rate": 9.686614173228346e-07,
      "loss": 0.0002,
      "num_tokens": 93104105.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3384,
      "step_time": 23.159548055380583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 178.875,
      "completions/mean_terminated_length": 178.875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.4038897082209587,
      "epoch": 0.15678554886521537,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031293348874896765,
      "kl": 0.002823414048179984,
      "learning_rate": 9.686521537748957e-07,
      "loss": 0.0001,
      "num_tokens": 93156199.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3385,
      "step_time": 27.05347828567028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 204.0,
      "completions/mean_terminated_length": 204.0,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.44841163605451584,
      "epoch": 0.1568318666049097,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004759833682328463,
      "kl": 0.004427422536537051,
      "learning_rate": 9.686428902269568e-07,
      "loss": 0.0002,
      "num_tokens": 93190359.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3386,
      "step_time": 23.632544446736574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 152.125,
      "completions/mean_terminated_length": 152.125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3723982274532318,
      "epoch": 0.156878184344604,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029600670095533133,
      "kl": 0.0033330745063722134,
      "learning_rate": 9.68633626679018e-07,
      "loss": 0.0002,
      "num_tokens": 93245225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3387,
      "step_time": 25.62436442449689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 262.0,
      "completions/max_terminated_length": 262.0,
      "completions/mean_length": 214.1875,
      "completions/mean_terminated_length": 214.1875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.4489590525627136,
      "epoch": 0.15692450208429828,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11331822723150253,
      "kl": 0.00677445693872869,
      "learning_rate": 9.686243631310793e-07,
      "loss": 0.0583,
      "num_tokens": 93267500.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 3388,
      "step_time": 22.353625752031803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 158.75,
      "completions/mean_terminated_length": 158.75,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3952978104352951,
      "epoch": 0.15697081982399258,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005898735020309687,
      "kl": 0.00412388431141153,
      "learning_rate": 9.686150995831402e-07,
      "loss": 0.0002,
      "num_tokens": 93321672.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3389,
      "step_time": 24.38019158691168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 140.375,
      "completions/mean_terminated_length": 140.375,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.22559742256999016,
      "epoch": 0.1570171375636869,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005241250153630972,
      "kl": 0.0028257886588107795,
      "learning_rate": 9.686058360352013e-07,
      "loss": 0.0001,
      "num_tokens": 93341342.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3390,
      "step_time": 14.31430471688509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 131.5,
      "completions/mean_terminated_length": 131.5,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.3970692455768585,
      "epoch": 0.1570634553033812,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001841334393247962,
      "kl": 0.0016890050610527396,
      "learning_rate": 9.685965724872625e-07,
      "loss": 0.0001,
      "num_tokens": 93371558.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3391,
      "step_time": 16.866381518542767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 175.3125,
      "completions/mean_terminated_length": 175.3125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.18258456885814667,
      "epoch": 0.1571097730430755,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019344153115525842,
      "kl": 0.0013503170921467245,
      "learning_rate": 9.685873089393238e-07,
      "loss": 0.0001,
      "num_tokens": 93398379.0,
      "reward": 0.3545035123825073,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3545035123825073,
      "rewards/reward_func/std": 0.0,
      "step": 3392,
      "step_time": 18.63642694428563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 315.0,
      "completions/max_terminated_length": 315.0,
      "completions/mean_length": 230.1875,
      "completions/mean_terminated_length": 230.1875,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.4675423130393028,
      "epoch": 0.1571560907827698,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004736447241157293,
      "kl": 0.004607197945006192,
      "learning_rate": 9.68578045391385e-07,
      "loss": 0.0002,
      "num_tokens": 93429726.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3393,
      "step_time": 27.96315587684512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 303.0,
      "completions/max_terminated_length": 303.0,
      "completions/mean_length": 189.9375,
      "completions/mean_terminated_length": 189.9375,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3540070205926895,
      "epoch": 0.15720240852246412,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12353695929050446,
      "kl": 0.009493994759395719,
      "learning_rate": 9.68568781843446e-07,
      "loss": -0.142,
      "num_tokens": 93455453.0,
      "reward": 0.028723783791065216,
      "reward_std": 0.0784883201122284,
      "rewards/reward_func/mean": 0.028723783791065216,
      "rewards/reward_func/std": 0.0784883201122284,
      "step": 3394,
      "step_time": 26.026387214660645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 132.5,
      "completions/mean_terminated_length": 132.5,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3240819647908211,
      "epoch": 0.1572487262621584,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002312229946255684,
      "kl": 0.0019831361423712224,
      "learning_rate": 9.685595182955072e-07,
      "loss": 0.0001,
      "num_tokens": 93476197.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3395,
      "step_time": 15.30780778080225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 293.0,
      "completions/max_terminated_length": 293.0,
      "completions/mean_length": 220.0,
      "completions/mean_terminated_length": 220.0,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.20126070827245712,
      "epoch": 0.1572950440018527,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08587631583213806,
      "kl": 0.004368889727629721,
      "learning_rate": 9.685502547475683e-07,
      "loss": 0.0104,
      "num_tokens": 93515397.0,
      "reward": 0.996159553527832,
      "reward_std": 0.015361929312348366,
      "rewards/reward_func/mean": 0.996159553527832,
      "rewards/reward_func/std": 0.015361934900283813,
      "step": 3396,
      "step_time": 29.787671122699976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 179.5625,
      "completions/mean_terminated_length": 179.5625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.26427941769361496,
      "epoch": 0.157341361741547,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009840679354965687,
      "kl": 0.004796617780812085,
      "learning_rate": 9.685409911996294e-07,
      "loss": 0.0002,
      "num_tokens": 93543454.0,
      "reward": 0.7598356604576111,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7598356604576111,
      "rewards/reward_func/std": 0.0,
      "step": 3397,
      "step_time": 21.67714450880885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 129.1875,
      "completions/mean_terminated_length": 129.1875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.28759458661079407,
      "epoch": 0.15738767948124133,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002306500216946006,
      "kl": 0.0020671751408372074,
      "learning_rate": 9.685317276516905e-07,
      "loss": 0.0001,
      "num_tokens": 93564065.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3398,
      "step_time": 14.972420245409012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 214.625,
      "completions/mean_terminated_length": 214.625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.24402768164873123,
      "epoch": 0.15743399722093562,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09991750866174698,
      "kl": 0.01453024661168456,
      "learning_rate": 9.685224641037517e-07,
      "loss": -0.2018,
      "num_tokens": 93587323.0,
      "reward": 0.5235850811004639,
      "reward_std": 0.4175490736961365,
      "rewards/reward_func/mean": 0.5235850811004639,
      "rewards/reward_func/std": 0.41754910349845886,
      "step": 3399,
      "step_time": 23.29434657841921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 191.5,
      "completions/mean_terminated_length": 191.5,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.2214699350297451,
      "epoch": 0.15748031496062992,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007350526284426451,
      "kl": 0.021009589545428753,
      "learning_rate": 9.685132005558128e-07,
      "loss": 0.001,
      "num_tokens": 93614323.0,
      "reward": 0.8071032166481018,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8071032166481018,
      "rewards/reward_func/std": 0.0,
      "step": 3400,
      "step_time": 19.339279111474752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 172.625,
      "completions/mean_terminated_length": 172.625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.20853421837091446,
      "epoch": 0.15752663270032422,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007097158581018448,
      "kl": 0.006123285507783294,
      "learning_rate": 9.68503937007874e-07,
      "loss": 0.0003,
      "num_tokens": 93649309.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3401,
      "step_time": 21.030549950897694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 118.6875,
      "completions/mean_terminated_length": 118.6875,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.24639840051531792,
      "epoch": 0.15757295044001854,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006378231104463339,
      "kl": 0.003113148733973503,
      "learning_rate": 9.68494673459935e-07,
      "loss": 0.0002,
      "num_tokens": 93668776.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3402,
      "step_time": 13.50205684453249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 183.8125,
      "completions/mean_terminated_length": 183.8125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.393598273396492,
      "epoch": 0.15761926817971283,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025114002637565136,
      "kl": 0.0025301979621872306,
      "learning_rate": 9.684854099119962e-07,
      "loss": 0.0001,
      "num_tokens": 93696293.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3403,
      "step_time": 20.72625645622611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 295.0,
      "completions/max_terminated_length": 295.0,
      "completions/mean_length": 222.5625,
      "completions/mean_terminated_length": 222.5625,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.356412336230278,
      "epoch": 0.15766558591940713,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12194860726594925,
      "kl": 0.012284463155083358,
      "learning_rate": 9.684761463640573e-07,
      "loss": -0.1091,
      "num_tokens": 93734302.0,
      "reward": 0.25,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 3404,
      "step_time": 28.05081032589078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 188.5625,
      "completions/mean_terminated_length": 188.5625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3936387225985527,
      "epoch": 0.15771190365910143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1348794847726822,
      "kl": 0.007617619470693171,
      "learning_rate": 9.684668828161186e-07,
      "loss": 0.0477,
      "num_tokens": 93758359.0,
      "reward": 0.34241390228271484,
      "reward_std": 0.4565494656562805,
      "rewards/reward_func/mean": 0.34241390228271484,
      "rewards/reward_func/std": 0.4565494656562805,
      "step": 3405,
      "step_time": 22.871876165270805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 146.9375,
      "completions/mean_terminated_length": 146.9375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.18180694431066513,
      "epoch": 0.15775822139879575,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004647223278880119,
      "kl": 0.00461869384162128,
      "learning_rate": 9.684576192681798e-07,
      "loss": 0.0002,
      "num_tokens": 93780614.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 3406,
      "step_time": 17.274998400360346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 133.75,
      "completions/mean_terminated_length": 133.75,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.29551438987255096,
      "epoch": 0.15780453913849005,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020865807309746742,
      "kl": 0.001857791270595044,
      "learning_rate": 9.684483557202409e-07,
      "loss": 0.0001,
      "num_tokens": 93809538.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3407,
      "step_time": 16.46083966270089
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 217.6875,
      "completions/mean_terminated_length": 217.6875,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "entropy": 0.15124435350298882,
      "epoch": 0.15785085687818434,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00468370970338583,
      "kl": 0.0036985211190767586,
      "learning_rate": 9.68439092172302e-07,
      "loss": 0.0002,
      "num_tokens": 93835053.0,
      "reward": 0.8970033526420593,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8970033526420593,
      "rewards/reward_func/std": 0.0,
      "step": 3408,
      "step_time": 21.989070676267147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 196.0,
      "completions/mean_terminated_length": 196.0,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.19563382118940353,
      "epoch": 0.15789717461787864,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14159588515758514,
      "kl": 0.008461256831651554,
      "learning_rate": 9.684298286243631e-07,
      "loss": -0.0181,
      "num_tokens": 93856813.0,
      "reward": 0.5811158418655396,
      "reward_std": 0.2563861012458801,
      "rewards/reward_func/mean": 0.5811158418655396,
      "rewards/reward_func/std": 0.2563861012458801,
      "step": 3409,
      "step_time": 18.760935347527266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 177.75,
      "completions/mean_terminated_length": 177.75,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.22291510179638863,
      "epoch": 0.15794349235757296,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003395486855879426,
      "kl": 0.0027317614876665175,
      "learning_rate": 9.684205650764243e-07,
      "loss": 0.0001,
      "num_tokens": 93890249.0,
      "reward": 0.8970773816108704,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8970773816108704,
      "rewards/reward_func/std": 0.0,
      "step": 3410,
      "step_time": 20.72183408588171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 215.0625,
      "completions/mean_terminated_length": 215.0625,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.32530076056718826,
      "epoch": 0.15798981009726726,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08521021157503128,
      "kl": 0.008857256500050426,
      "learning_rate": 9.684113015284854e-07,
      "loss": 0.0267,
      "num_tokens": 93918442.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 3411,
      "step_time": 24.42685490101576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 127.9375,
      "completions/mean_terminated_length": 127.9375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.27352775633335114,
      "epoch": 0.15803612783696155,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023881150409579277,
      "kl": 0.0020404542156029493,
      "learning_rate": 9.684020379805465e-07,
      "loss": 0.0001,
      "num_tokens": 93940073.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3412,
      "step_time": 13.613922379910946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 143.75,
      "completions/mean_terminated_length": 143.75,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3109496012330055,
      "epoch": 0.15808244557665585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030934831593185663,
      "kl": 0.0021100371377542615,
      "learning_rate": 9.683927744326076e-07,
      "loss": 0.0001,
      "num_tokens": 93963029.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3413,
      "step_time": 15.253663532435894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 199.125,
      "completions/mean_terminated_length": 199.125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.38036343455314636,
      "epoch": 0.15812876331635017,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09645237028598785,
      "kl": 0.006299379514530301,
      "learning_rate": 9.683835108846688e-07,
      "loss": 0.0145,
      "num_tokens": 93984919.0,
      "reward": 0.3125,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.3125,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 3414,
      "step_time": 20.447531413286924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 124.125,
      "completions/mean_terminated_length": 124.125,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.30600807815790176,
      "epoch": 0.15817508105604447,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006566811352968216,
      "kl": 0.0029020439833402634,
      "learning_rate": 9.683742473367299e-07,
      "loss": 0.0001,
      "num_tokens": 94006569.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3415,
      "step_time": 14.760872717946768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 389.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 306.8125,
      "completions/mean_terminated_length": 306.8125,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "entropy": 0.3685658276081085,
      "epoch": 0.15822139879573877,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06619012355804443,
      "kl": 0.0029626351897604764,
      "learning_rate": 9.68364983788791e-07,
      "loss": 0.079,
      "num_tokens": 94043254.0,
      "reward": 0.25,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 3416,
      "step_time": 33.7754629291594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 164.75,
      "completions/mean_terminated_length": 164.75,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.37912197411060333,
      "epoch": 0.15826771653543306,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1322353631258011,
      "kl": 0.008519368944689631,
      "learning_rate": 9.683557202408521e-07,
      "loss": -0.0144,
      "num_tokens": 94065906.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 3417,
      "step_time": 20.565210532397032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 132.75,
      "completions/mean_terminated_length": 132.75,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.32533005625009537,
      "epoch": 0.15831403427512739,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031542836222797632,
      "kl": 0.002352894749492407,
      "learning_rate": 9.683464566929135e-07,
      "loss": 0.0001,
      "num_tokens": 94101870.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3418,
      "step_time": 17.876115828752518
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 214.0,
      "completions/mean_terminated_length": 214.0,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.25735868141055107,
      "epoch": 0.15836035201482168,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012005128664895892,
      "kl": 0.0012013968371320516,
      "learning_rate": 9.683371931449746e-07,
      "loss": 0.0001,
      "num_tokens": 94140622.0,
      "reward": 0.5623413324356079,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5623413324356079,
      "rewards/reward_func/std": 0.0,
      "step": 3419,
      "step_time": 25.61725616827607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 178.0625,
      "completions/mean_terminated_length": 178.0625,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3643127679824829,
      "epoch": 0.15840666975451598,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003521176055073738,
      "kl": 0.0029144013533368707,
      "learning_rate": 9.683279295970355e-07,
      "loss": 0.0001,
      "num_tokens": 94162527.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3420,
      "step_time": 20.675210751593113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 300.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 267.9375,
      "completions/mean_terminated_length": 267.9375,
      "completions/min_length": 240.0,
      "completions/min_terminated_length": 240.0,
      "entropy": 0.23120523989200592,
      "epoch": 0.15845298749421027,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038808188401162624,
      "kl": 0.002891894255299121,
      "learning_rate": 9.683186660490966e-07,
      "loss": 0.0001,
      "num_tokens": 94195630.0,
      "reward": 0.7480222582817078,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7480222582817078,
      "rewards/reward_func/std": 0.0,
      "step": 3421,
      "step_time": 27.111752171069384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 182.875,
      "completions/mean_terminated_length": 182.875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.4212314486503601,
      "epoch": 0.1584993052339046,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036072733346372843,
      "kl": 0.0031521882046945393,
      "learning_rate": 9.68309402501158e-07,
      "loss": 0.0002,
      "num_tokens": 94216844.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3422,
      "step_time": 18.432662308216095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 244.5625,
      "completions/mean_terminated_length": 244.5625,
      "completions/min_length": 228.0,
      "completions/min_terminated_length": 228.0,
      "entropy": 0.24329178407788277,
      "epoch": 0.1585456229735989,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008524253964424133,
      "kl": 0.00909702992066741,
      "learning_rate": 9.68300138953219e-07,
      "loss": 0.0005,
      "num_tokens": 94254085.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3423,
      "step_time": 25.48354296386242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 209.0625,
      "completions/mean_terminated_length": 209.0625,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.2374827116727829,
      "epoch": 0.1585919407132932,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008499586954712868,
      "kl": 0.0075718306470662355,
      "learning_rate": 9.682908754052802e-07,
      "loss": 0.0004,
      "num_tokens": 94283974.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3424,
      "step_time": 22.72313467785716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 133.4375,
      "completions/mean_terminated_length": 133.4375,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2880973070859909,
      "epoch": 0.15863825845298749,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020167441107332706,
      "kl": 0.0015973700792528689,
      "learning_rate": 9.682816118573413e-07,
      "loss": 0.0001,
      "num_tokens": 94311341.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3425,
      "step_time": 15.675929341465235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 227.0,
      "completions/mean_terminated_length": 227.0,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.2634507790207863,
      "epoch": 0.1586845761926818,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007204752415418625,
      "kl": 0.006101812236011028,
      "learning_rate": 9.682723483094025e-07,
      "loss": 0.0003,
      "num_tokens": 94336733.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3426,
      "step_time": 23.877611380070448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 165.8125,
      "completions/mean_terminated_length": 165.8125,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.16471055895090103,
      "epoch": 0.1587308939323761,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032392528373748064,
      "kl": 0.0022068023099564016,
      "learning_rate": 9.682630847614636e-07,
      "loss": 0.0001,
      "num_tokens": 94370186.0,
      "reward": 0.8781879544258118,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8781879544258118,
      "rewards/reward_func/std": 0.0,
      "step": 3427,
      "step_time": 19.545306116342545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 318.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 207.625,
      "completions/mean_terminated_length": 207.625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.40462759882211685,
      "epoch": 0.1587772116720704,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10641691088676453,
      "kl": 0.013477440923452377,
      "learning_rate": 9.682538212135247e-07,
      "loss": -0.1892,
      "num_tokens": 94394916.0,
      "reward": 0.23951296508312225,
      "reward_std": 0.4300934672355652,
      "rewards/reward_func/mean": 0.23951296508312225,
      "rewards/reward_func/std": 0.4300934970378876,
      "step": 3428,
      "step_time": 26.73448269441724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 126.5625,
      "completions/mean_terminated_length": 126.5625,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2731931433081627,
      "epoch": 0.1588235294117647,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023880538064986467,
      "kl": 0.0017022228857968003,
      "learning_rate": 9.682445576655858e-07,
      "loss": 0.0001,
      "num_tokens": 94414557.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3429,
      "step_time": 13.645759027451277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 174.5625,
      "completions/mean_terminated_length": 174.5625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.33578938245773315,
      "epoch": 0.15886984715145902,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002920533297583461,
      "kl": 0.001993319718167186,
      "learning_rate": 9.68235294117647e-07,
      "loss": 0.0001,
      "num_tokens": 94445430.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3430,
      "step_time": 21.873732414096594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 176.5625,
      "completions/mean_terminated_length": 176.5625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.3507569134235382,
      "epoch": 0.15891616489115332,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032085312996059656,
      "kl": 0.0025718085817061365,
      "learning_rate": 9.68226030569708e-07,
      "loss": 0.0001,
      "num_tokens": 94466815.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3431,
      "step_time": 19.05837071686983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 161.375,
      "completions/mean_terminated_length": 161.375,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.30730706453323364,
      "epoch": 0.1589624826308476,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038303835317492485,
      "kl": 0.0025260396650992334,
      "learning_rate": 9.682167670217692e-07,
      "loss": 0.0001,
      "num_tokens": 94486949.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3432,
      "step_time": 16.480333171784878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 133.6875,
      "completions/mean_terminated_length": 133.6875,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.22230102494359016,
      "epoch": 0.1590088003705419,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017625931650400162,
      "kl": 0.0012499088479671627,
      "learning_rate": 9.682075034738303e-07,
      "loss": 0.0001,
      "num_tokens": 94506496.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3433,
      "step_time": 14.038519285619259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 292.0,
      "completions/max_terminated_length": 292.0,
      "completions/mean_length": 205.25,
      "completions/mean_terminated_length": 205.25,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.4865777716040611,
      "epoch": 0.15905511811023623,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003165813395753503,
      "kl": 0.0032938916701823473,
      "learning_rate": 9.681982399258915e-07,
      "loss": 0.0002,
      "num_tokens": 94531844.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3434,
      "step_time": 24.92224333807826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 169.9375,
      "completions/mean_terminated_length": 169.9375,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.4473801478743553,
      "epoch": 0.15910143584993053,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019321624422445893,
      "kl": 0.002116112707881257,
      "learning_rate": 9.681889763779528e-07,
      "loss": 0.0001,
      "num_tokens": 94560931.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3435,
      "step_time": 21.872719943523407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 132.75,
      "completions/mean_terminated_length": 132.75,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.24387311190366745,
      "epoch": 0.15914775358962482,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038844021037220955,
      "kl": 0.002167000056942925,
      "learning_rate": 9.68179712830014e-07,
      "loss": 0.0001,
      "num_tokens": 94580335.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3436,
      "step_time": 14.557889740914106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 163.875,
      "completions/mean_terminated_length": 163.875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.33780235052108765,
      "epoch": 0.15919407132931912,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008975336328148842,
      "kl": 0.007652397267520428,
      "learning_rate": 9.68170449282075e-07,
      "loss": 0.0004,
      "num_tokens": 94603469.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3437,
      "step_time": 18.062211614102125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 175.0,
      "completions/mean_terminated_length": 175.0,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.26001911610364914,
      "epoch": 0.15924038906901344,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1583942174911499,
      "kl": 0.00648200698196888,
      "learning_rate": 9.681611857341362e-07,
      "loss": 0.0605,
      "num_tokens": 94624125.0,
      "reward": 0.7865698337554932,
      "reward_std": 0.2204297035932541,
      "rewards/reward_func/mean": 0.7865698337554932,
      "rewards/reward_func/std": 0.2204297035932541,
      "step": 3438,
      "step_time": 22.236902624368668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 170.8125,
      "completions/mean_terminated_length": 170.8125,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.23996953666210175,
      "epoch": 0.15928670680870774,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1449446976184845,
      "kl": 0.010345470858737826,
      "learning_rate": 9.681519221861973e-07,
      "loss": -0.0544,
      "num_tokens": 94661098.0,
      "reward": 0.609000027179718,
      "reward_std": 0.4912189245223999,
      "rewards/reward_func/mean": 0.609000027179718,
      "rewards/reward_func/std": 0.4912189245223999,
      "step": 3439,
      "step_time": 21.725098561495543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 122.6875,
      "completions/mean_terminated_length": 122.6875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.23615119233727455,
      "epoch": 0.15933302454840204,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022867852821946144,
      "kl": 0.0015611594135407358,
      "learning_rate": 9.681426586382584e-07,
      "loss": 0.0001,
      "num_tokens": 94684325.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3440,
      "step_time": 13.927519869059324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 168.5,
      "completions/mean_terminated_length": 168.5,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.3941641077399254,
      "epoch": 0.15937934228809633,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005300555378198624,
      "kl": 0.004834897932596505,
      "learning_rate": 9.681333950903196e-07,
      "loss": 0.0002,
      "num_tokens": 94709981.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3441,
      "step_time": 18.493968956172466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 339.0,
      "completions/max_terminated_length": 339.0,
      "completions/mean_length": 257.375,
      "completions/mean_terminated_length": 257.375,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "entropy": 0.23197712376713753,
      "epoch": 0.15942566002779066,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0854479968547821,
      "kl": 0.011729596881195903,
      "learning_rate": 9.681241315423807e-07,
      "loss": -0.0347,
      "num_tokens": 94745571.0,
      "reward": 0.5779284238815308,
      "reward_std": 0.14207902550697327,
      "rewards/reward_func/mean": 0.5779284238815308,
      "rewards/reward_func/std": 0.14207902550697327,
      "step": 3442,
      "step_time": 31.97803706303239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 152.375,
      "completions/mean_terminated_length": 152.375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.20799680799245834,
      "epoch": 0.15947197776748495,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002170357620343566,
      "kl": 0.0017289408133365214,
      "learning_rate": 9.681148679944418e-07,
      "loss": 0.0001,
      "num_tokens": 94780617.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 3443,
      "step_time": 20.633345916867256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 184.0625,
      "completions/mean_terminated_length": 184.0625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.40174292773008347,
      "epoch": 0.15951829550717925,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006572690326720476,
      "kl": 0.003529248700942844,
      "learning_rate": 9.68105604446503e-07,
      "loss": 0.0002,
      "num_tokens": 94810666.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3444,
      "step_time": 21.925837852060795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 293.0,
      "completions/max_terminated_length": 293.0,
      "completions/mean_length": 212.375,
      "completions/mean_terminated_length": 212.375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.413627065718174,
      "epoch": 0.15956461324687354,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005545541178435087,
      "kl": 0.006141959456726909,
      "learning_rate": 9.68096340898564e-07,
      "loss": -0.0017,
      "num_tokens": 94848768.0,
      "reward": 5.986431119708868e-07,
      "reward_std": 1.6358043239961262e-06,
      "rewards/reward_func/mean": 5.986431119708868e-07,
      "rewards/reward_func/std": 1.635804437682964e-06,
      "step": 3445,
      "step_time": 28.424156863242388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 167.25,
      "completions/mean_terminated_length": 167.25,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.38636625558137894,
      "epoch": 0.15961093098656787,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005689219571650028,
      "kl": 0.004969300061929971,
      "learning_rate": 9.680870773506252e-07,
      "loss": 0.0003,
      "num_tokens": 94870532.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3446,
      "step_time": 18.343412913382053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 142.9375,
      "completions/mean_terminated_length": 142.9375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.20394393801689148,
      "epoch": 0.15965724872626216,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015257818158715963,
      "kl": 0.001183865446364507,
      "learning_rate": 9.680778138026863e-07,
      "loss": 0.0001,
      "num_tokens": 94890515.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3447,
      "step_time": 14.80742610245943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 194.5625,
      "completions/mean_terminated_length": 194.5625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.34553437680006027,
      "epoch": 0.15970356646595646,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0065522403456270695,
      "kl": 0.005666043201927096,
      "learning_rate": 9.680685502547476e-07,
      "loss": 0.0004,
      "num_tokens": 94917564.0,
      "reward": 1.0834193062692066e-06,
      "reward_std": 5.212616542848991e-07,
      "rewards/reward_func/mean": 1.0834193062692066e-06,
      "rewards/reward_func/std": 5.212616542848991e-07,
      "step": 3448,
      "step_time": 20.05315352603793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 163.9375,
      "completions/mean_terminated_length": 163.9375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.362993448972702,
      "epoch": 0.15974988420565076,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005834056995809078,
      "kl": 0.004242094000801444,
      "learning_rate": 9.680592867068088e-07,
      "loss": 0.0002,
      "num_tokens": 94939003.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3449,
      "step_time": 18.971878744661808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 300.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 206.3125,
      "completions/mean_terminated_length": 206.3125,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.24320535734295845,
      "epoch": 0.15979620194534508,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10007061809301376,
      "kl": 0.005496683996170759,
      "learning_rate": 9.680500231588699e-07,
      "loss": 0.0052,
      "num_tokens": 94964368.0,
      "reward": 0.9650192260742188,
      "reward_std": 0.020858503878116608,
      "rewards/reward_func/mean": 0.9650192260742188,
      "rewards/reward_func/std": 0.020858513191342354,
      "step": 3450,
      "step_time": 25.901068847626448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 132.75,
      "completions/mean_terminated_length": 132.75,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.23371541500091553,
      "epoch": 0.15984251968503937,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002019039588049054,
      "kl": 0.0014723509084433317,
      "learning_rate": 9.680407596109308e-07,
      "loss": 0.0001,
      "num_tokens": 94984268.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3451,
      "step_time": 14.849624052643776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 172.1875,
      "completions/mean_terminated_length": 172.1875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.4102185070514679,
      "epoch": 0.15988883742473367,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003499179147183895,
      "kl": 0.002860469976440072,
      "learning_rate": 9.680314960629921e-07,
      "loss": 0.0001,
      "num_tokens": 95010095.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3452,
      "step_time": 18.349357716739178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 174.125,
      "completions/mean_terminated_length": 174.125,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.34826090186834335,
      "epoch": 0.15993515516442797,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002932294737547636,
      "kl": 0.002209923230111599,
      "learning_rate": 9.680222325150533e-07,
      "loss": 0.0001,
      "num_tokens": 95041121.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3453,
      "step_time": 20.477946385741234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 179.0,
      "completions/mean_terminated_length": 179.0,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.4032389596104622,
      "epoch": 0.1599814729041223,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004911939613521099,
      "kl": 0.004299458349123597,
      "learning_rate": 9.680129689671144e-07,
      "loss": 0.0002,
      "num_tokens": 95069233.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3454,
      "step_time": 20.019405771046877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 147.8125,
      "completions/mean_terminated_length": 147.8125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.2919314131140709,
      "epoch": 0.1600277906438166,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002293745754286647,
      "kl": 0.0022720624110661447,
      "learning_rate": 9.680037054191755e-07,
      "loss": 0.0001,
      "num_tokens": 95090942.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3455,
      "step_time": 17.30674758180976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 143.5625,
      "completions/mean_terminated_length": 143.5625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.23198441788554192,
      "epoch": 0.16007410838351088,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027736607007682323,
      "kl": 0.0017384123057126999,
      "learning_rate": 9.679944418712366e-07,
      "loss": 0.0001,
      "num_tokens": 95111351.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3456,
      "step_time": 16.20187959820032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 415.0,
      "completions/max_terminated_length": 415.0,
      "completions/mean_length": 271.0625,
      "completions/mean_terminated_length": 271.0625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.27159250155091286,
      "epoch": 0.16012042612320518,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09969770163297653,
      "kl": 0.011295750504359603,
      "learning_rate": 9.679851783232978e-07,
      "loss": -0.3148,
      "num_tokens": 95139416.0,
      "reward": 0.39337778091430664,
      "reward_std": 0.4508124887943268,
      "rewards/reward_func/mean": 0.39337778091430664,
      "rewards/reward_func/std": 0.45081251859664917,
      "step": 3457,
      "step_time": 33.91886493563652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 140.125,
      "completions/mean_terminated_length": 140.125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.1908768080174923,
      "epoch": 0.1601667438628995,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002794044790789485,
      "kl": 0.002192254352848977,
      "learning_rate": 9.679759147753589e-07,
      "loss": 0.0001,
      "num_tokens": 95161722.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3458,
      "step_time": 16.333599999547005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 143.9375,
      "completions/mean_terminated_length": 143.9375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.29709840565919876,
      "epoch": 0.1602130616025938,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025707187596708536,
      "kl": 0.001648180972551927,
      "learning_rate": 9.6796665122742e-07,
      "loss": 0.0001,
      "num_tokens": 95184073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3459,
      "step_time": 15.60405432805419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 131.4375,
      "completions/mean_terminated_length": 131.4375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2899218425154686,
      "epoch": 0.1602593793422881,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004910370334982872,
      "kl": 0.002971170237287879,
      "learning_rate": 9.679573876794811e-07,
      "loss": 0.0001,
      "num_tokens": 95207440.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3460,
      "step_time": 15.209514487534761
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 134.25,
      "completions/mean_terminated_length": 134.25,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.30879829823970795,
      "epoch": 0.1603056970819824,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002629196736961603,
      "kl": 0.0021158503368496895,
      "learning_rate": 9.679481241315423e-07,
      "loss": 0.0001,
      "num_tokens": 95233124.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3461,
      "step_time": 16.275724075734615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 174.6875,
      "completions/mean_terminated_length": 174.6875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.1905820220708847,
      "epoch": 0.1603520148216767,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032742188777774572,
      "kl": 0.0061743218684569,
      "learning_rate": 9.679388605836036e-07,
      "loss": 0.0003,
      "num_tokens": 95254527.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3462,
      "step_time": 17.634504687041044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 192.8125,
      "completions/mean_terminated_length": 192.8125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.2610615938901901,
      "epoch": 0.160398332561371,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22092193365097046,
      "kl": 0.006980406120419502,
      "learning_rate": 9.679295970356645e-07,
      "loss": -0.07,
      "num_tokens": 95303628.0,
      "reward": 0.0852164477109909,
      "reward_std": 0.05081327259540558,
      "rewards/reward_func/mean": 0.0852164477109909,
      "rewards/reward_func/std": 0.05081327632069588,
      "step": 3463,
      "step_time": 27.251349058002234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 151.0,
      "completions/mean_terminated_length": 151.0,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.19659963622689247,
      "epoch": 0.1604446503010653,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15603183209896088,
      "kl": 0.010294223902747035,
      "learning_rate": 9.679203334877256e-07,
      "loss": -0.0192,
      "num_tokens": 95324828.0,
      "reward": 0.9464435577392578,
      "reward_std": 0.014281725510954857,
      "rewards/reward_func/mean": 0.9464435577392578,
      "rewards/reward_func/std": 0.014281720854341984,
      "step": 3464,
      "step_time": 16.60369373112917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 324.0,
      "completions/max_terminated_length": 324.0,
      "completions/mean_length": 273.3125,
      "completions/mean_terminated_length": 273.3125,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "entropy": 0.2528546415269375,
      "epoch": 0.1604909680407596,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09785427898168564,
      "kl": 0.003852492547594011,
      "learning_rate": 9.67911069939787e-07,
      "loss": 0.0099,
      "num_tokens": 95360001.0,
      "reward": 0.1862429976463318,
      "reward_std": 0.049664802849292755,
      "rewards/reward_func/mean": 0.1862429976463318,
      "rewards/reward_func/std": 0.04966479912400246,
      "step": 3465,
      "step_time": 29.23867255076766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 199.0625,
      "completions/mean_terminated_length": 199.0625,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.3757583796977997,
      "epoch": 0.16053728578045393,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13698962330818176,
      "kl": 0.015001054853200912,
      "learning_rate": 9.67901806391848e-07,
      "loss": -0.0277,
      "num_tokens": 95392098.0,
      "reward": 0.2977655529975891,
      "reward_std": 0.40756502747535706,
      "rewards/reward_func/mean": 0.2977655529975891,
      "rewards/reward_func/std": 0.40756499767303467,
      "step": 3466,
      "step_time": 21.701540529727936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 178.9375,
      "completions/mean_terminated_length": 178.9375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.38655295222997665,
      "epoch": 0.16058360352014822,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00909447856247425,
      "kl": 0.0066490627359598875,
      "learning_rate": 9.678925428439092e-07,
      "loss": 0.0003,
      "num_tokens": 95413793.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3467,
      "step_time": 19.39699961617589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 132.4375,
      "completions/mean_terminated_length": 132.4375,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.29878050088882446,
      "epoch": 0.16062992125984252,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004993073642253876,
      "kl": 0.00306686176918447,
      "learning_rate": 9.678832792959704e-07,
      "loss": 0.0002,
      "num_tokens": 95436760.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3468,
      "step_time": 15.505547307431698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 127.8125,
      "completions/mean_terminated_length": 127.8125,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.3205405920743942,
      "epoch": 0.1606762389995368,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023301353212445974,
      "kl": 0.001960799883818254,
      "learning_rate": 9.678740157480315e-07,
      "loss": 0.0001,
      "num_tokens": 95462165.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3469,
      "step_time": 15.969914954155684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 165.3125,
      "completions/mean_terminated_length": 165.3125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.4200323596596718,
      "epoch": 0.16072255673923114,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003014979185536504,
      "kl": 0.0024219228071160614,
      "learning_rate": 9.678647522000926e-07,
      "loss": 0.0001,
      "num_tokens": 95507978.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3470,
      "step_time": 22.58441649377346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 111.375,
      "completions/mean_terminated_length": 111.375,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.2844608575105667,
      "epoch": 0.16076887447892543,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003143490757793188,
      "kl": 0.0021750289015471935,
      "learning_rate": 9.678554886521537e-07,
      "loss": 0.0001,
      "num_tokens": 95527488.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3471,
      "step_time": 12.80592230707407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 174.375,
      "completions/mean_terminated_length": 174.375,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.2442374974489212,
      "epoch": 0.16081519221861973,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.021279064938426018,
      "kl": 0.012040883302688599,
      "learning_rate": 9.678462251042148e-07,
      "loss": 0.0006,
      "num_tokens": 95548198.0,
      "reward": 0.5986744165420532,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5986744165420532,
      "rewards/reward_func/std": 0.0,
      "step": 3472,
      "step_time": 18.40150808915496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 163.25,
      "completions/mean_terminated_length": 163.25,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.412839911878109,
      "epoch": 0.16086150995831403,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007059311494231224,
      "kl": 0.004065240442287177,
      "learning_rate": 9.67836961556276e-07,
      "loss": 0.0002,
      "num_tokens": 95569018.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3473,
      "step_time": 17.311843056231737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 132.5625,
      "completions/mean_terminated_length": 132.5625,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.27657292783260345,
      "epoch": 0.16090782769800835,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030245569068938494,
      "kl": 0.0019606098940130323,
      "learning_rate": 9.67827698008337e-07,
      "loss": 0.0001,
      "num_tokens": 95588675.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3474,
      "step_time": 14.221028413623571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 202.8125,
      "completions/mean_terminated_length": 202.8125,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.3112871088087559,
      "epoch": 0.16095414543770264,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10125451534986496,
      "kl": 0.017683086451143026,
      "learning_rate": 9.678184344603982e-07,
      "loss": -0.0175,
      "num_tokens": 95617088.0,
      "reward": 0.5819142460823059,
      "reward_std": 0.4655313789844513,
      "rewards/reward_func/mean": 0.5819142460823059,
      "rewards/reward_func/std": 0.4655313789844513,
      "step": 3475,
      "step_time": 21.86097837984562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 128.9375,
      "completions/mean_terminated_length": 128.9375,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.2914179116487503,
      "epoch": 0.16100046317739694,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024098509456962347,
      "kl": 0.001557468785904348,
      "learning_rate": 9.678091709124593e-07,
      "loss": 0.0001,
      "num_tokens": 95638015.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3476,
      "step_time": 13.901871923357248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 154.125,
      "completions/mean_terminated_length": 154.125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.4389551281929016,
      "epoch": 0.16104678091709124,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016640285030007362,
      "kl": 0.0019833649857901037,
      "learning_rate": 9.677999073645205e-07,
      "loss": 0.0001,
      "num_tokens": 95681729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3477,
      "step_time": 21.65598686784506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 165.8125,
      "completions/mean_terminated_length": 165.8125,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.23619088158011436,
      "epoch": 0.16109309865678556,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006988085340708494,
      "kl": 0.004499021626543254,
      "learning_rate": 9.677906438165818e-07,
      "loss": 0.0002,
      "num_tokens": 95702462.0,
      "reward": 0.29761362075805664,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.29761362075805664,
      "rewards/reward_func/std": 0.0,
      "step": 3478,
      "step_time": 17.2142390049994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 130.25,
      "completions/mean_terminated_length": 130.25,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.29452070593833923,
      "epoch": 0.16113941639647986,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002671242458745837,
      "kl": 0.0019010700343642384,
      "learning_rate": 9.67781380268643e-07,
      "loss": 0.0001,
      "num_tokens": 95722978.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3479,
      "step_time": 15.805702719837427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 175.875,
      "completions/mean_terminated_length": 175.875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.33927278220653534,
      "epoch": 0.16118573413617415,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12174668908119202,
      "kl": 0.011155621381476521,
      "learning_rate": 9.67772116720704e-07,
      "loss": -0.0707,
      "num_tokens": 95746160.0,
      "reward": 0.04252343252301216,
      "reward_std": 0.17009373009204865,
      "rewards/reward_func/mean": 0.04252343252301216,
      "rewards/reward_func/std": 0.17009373009204865,
      "step": 3480,
      "step_time": 20.51972856372595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 197.0,
      "completions/mean_terminated_length": 197.0,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.23060177639126778,
      "epoch": 0.16123205187586845,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014210606925189495,
      "kl": 0.0080897071165964,
      "learning_rate": 9.67762853172765e-07,
      "loss": 0.0004,
      "num_tokens": 95768144.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3481,
      "step_time": 18.474943548440933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 192.3125,
      "completions/mean_terminated_length": 192.3125,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.2537849582731724,
      "epoch": 0.16127836961556277,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038933195173740387,
      "kl": 0.003093552717473358,
      "learning_rate": 9.677535896248263e-07,
      "loss": 0.0002,
      "num_tokens": 95805557.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3482,
      "step_time": 23.168149556964636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 134.8125,
      "completions/mean_terminated_length": 134.8125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.38977406173944473,
      "epoch": 0.16132468735525707,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028483986388891935,
      "kl": 0.002649834379553795,
      "learning_rate": 9.677443260768874e-07,
      "loss": 0.0001,
      "num_tokens": 95847650.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3483,
      "step_time": 19.173815231770277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 362.0,
      "completions/max_terminated_length": 362.0,
      "completions/mean_length": 339.1875,
      "completions/mean_terminated_length": 339.1875,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "entropy": 0.21618995442986488,
      "epoch": 0.16137100509495136,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0660412535071373,
      "kl": 0.0036207985249347985,
      "learning_rate": 9.677350625289486e-07,
      "loss": 0.0025,
      "num_tokens": 95875797.0,
      "reward": 0.977266788482666,
      "reward_std": 0.014211704954504967,
      "rewards/reward_func/mean": 0.977266788482666,
      "rewards/reward_func/std": 0.014211706817150116,
      "step": 3484,
      "step_time": 30.194304917007685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 178.125,
      "completions/mean_terminated_length": 178.125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.36701615154743195,
      "epoch": 0.16141732283464566,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004110343288630247,
      "kl": 0.0034016179852187634,
      "learning_rate": 9.677257989810097e-07,
      "loss": 0.0002,
      "num_tokens": 95897751.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3485,
      "step_time": 20.234230373054743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 172.75,
      "completions/mean_terminated_length": 172.75,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.20842647179961205,
      "epoch": 0.16146364057433998,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00140270940028131,
      "kl": 0.0012515323469415307,
      "learning_rate": 9.677165354330708e-07,
      "loss": 0.0001,
      "num_tokens": 95924563.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 3486,
      "step_time": 18.932017970830202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 318.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 213.0625,
      "completions/mean_terminated_length": 213.0625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.36111921072006226,
      "epoch": 0.16150995831403428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10579774528741837,
      "kl": 0.014420822961255908,
      "learning_rate": 9.67707271885132e-07,
      "loss": -0.217,
      "num_tokens": 95947828.0,
      "reward": 0.22921621799468994,
      "reward_std": 0.2373117208480835,
      "rewards/reward_func/mean": 0.22921621799468994,
      "rewards/reward_func/std": 0.2373117208480835,
      "step": 3487,
      "step_time": 26.59403732419014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 194.4375,
      "completions/mean_terminated_length": 194.4375,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.19422125443816185,
      "epoch": 0.16155627605372858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08066341280937195,
      "kl": 0.007530417904490605,
      "learning_rate": 9.67698008337193e-07,
      "loss": -0.0174,
      "num_tokens": 95984987.0,
      "reward": 0.8204387426376343,
      "reward_std": 0.050927866250276566,
      "rewards/reward_func/mean": 0.8204387426376343,
      "rewards/reward_func/std": 0.05092788115143776,
      "step": 3488,
      "step_time": 22.20290519297123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 166.375,
      "completions/mean_terminated_length": 166.375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.2949298843741417,
      "epoch": 0.16160259379342287,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029069860465824604,
      "kl": 0.002154695277567953,
      "learning_rate": 9.676887447892542e-07,
      "loss": 0.0001,
      "num_tokens": 96011553.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3489,
      "step_time": 17.839666597545147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 182.5,
      "completions/mean_terminated_length": 182.5,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.3527740314602852,
      "epoch": 0.1616489115331172,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11044298112392426,
      "kl": 0.007320728502236307,
      "learning_rate": 9.676794812413153e-07,
      "loss": -0.0366,
      "num_tokens": 96044457.0,
      "reward": 0.06291896849870682,
      "reward_std": 0.13527171313762665,
      "rewards/reward_func/mean": 0.06291896849870682,
      "rewards/reward_func/std": 0.13527172803878784,
      "step": 3490,
      "step_time": 22.78624314442277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 161.9375,
      "completions/mean_terminated_length": 161.9375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.2925775647163391,
      "epoch": 0.1616952292728115,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006018325686454773,
      "kl": 0.004330728843342513,
      "learning_rate": 9.676702176933764e-07,
      "loss": 0.0002,
      "num_tokens": 96070088.0,
      "reward": 0.3678794503211975,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3678794503211975,
      "rewards/reward_func/std": 0.0,
      "step": 3491,
      "step_time": 17.549418538808823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 207.3125,
      "completions/mean_terminated_length": 207.3125,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.2782725617289543,
      "epoch": 0.1617415470125058,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11863549798727036,
      "kl": 0.02717333915643394,
      "learning_rate": 9.676609541454378e-07,
      "loss": -0.0041,
      "num_tokens": 96091853.0,
      "reward": 0.9162025451660156,
      "reward_std": 0.2586844563484192,
      "rewards/reward_func/mean": 0.9162025451660156,
      "rewards/reward_func/std": 0.2586844563484192,
      "step": 3492,
      "step_time": 20.521996207535267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 189.5,
      "completions/mean_terminated_length": 189.5,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.39548636227846146,
      "epoch": 0.16178786475220008,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10352324694395065,
      "kl": 0.007150716031901538,
      "learning_rate": 9.67651690597499e-07,
      "loss": -0.0549,
      "num_tokens": 96114565.0,
      "reward": 0.25,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 3493,
      "step_time": 20.147456903010607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 217.3125,
      "completions/mean_terminated_length": 217.3125,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.36438772827386856,
      "epoch": 0.1618341824918944,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11570117622613907,
      "kl": 0.004850264871492982,
      "learning_rate": 9.676424270495598e-07,
      "loss": -0.0339,
      "num_tokens": 96144538.0,
      "reward": 0.6824508905410767,
      "reward_std": 0.4069547653198242,
      "rewards/reward_func/mean": 0.6824508905410767,
      "rewards/reward_func/std": 0.4069547653198242,
      "step": 3494,
      "step_time": 23.432373207062483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 169.375,
      "completions/mean_terminated_length": 169.375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.406574584543705,
      "epoch": 0.1618805002315887,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002736002206802368,
      "kl": 0.002772693696897477,
      "learning_rate": 9.676331635016211e-07,
      "loss": 0.0001,
      "num_tokens": 96183808.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3495,
      "step_time": 20.829304948449135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 356.0,
      "completions/max_terminated_length": 356.0,
      "completions/mean_length": 266.9375,
      "completions/mean_terminated_length": 266.9375,
      "completions/min_length": 212.0,
      "completions/min_terminated_length": 212.0,
      "entropy": 0.4659477397799492,
      "epoch": 0.161926817971283,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11198234558105469,
      "kl": 0.006621643784455955,
      "learning_rate": 9.676238999536823e-07,
      "loss": 0.1292,
      "num_tokens": 96213423.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 3496,
      "step_time": 31.571284301579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 126.4375,
      "completions/mean_terminated_length": 126.4375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.282311387360096,
      "epoch": 0.1619731357109773,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004010648932307959,
      "kl": 0.0024612965062260628,
      "learning_rate": 9.676146364057434e-07,
      "loss": 0.0001,
      "num_tokens": 96234886.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3497,
      "step_time": 16.80099131911993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 136.6875,
      "completions/mean_terminated_length": 136.6875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.2940609082579613,
      "epoch": 0.16201945345067162,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00230865809135139,
      "kl": 0.0018131097604054958,
      "learning_rate": 9.676053728578045e-07,
      "loss": 0.0001,
      "num_tokens": 96262481.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3498,
      "step_time": 16.95644073560834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 186.4375,
      "completions/mean_terminated_length": 186.4375,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.2812589779496193,
      "epoch": 0.16206577119036591,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13529819250106812,
      "kl": 0.026054322253912687,
      "learning_rate": 9.675961093098656e-07,
      "loss": 0.1005,
      "num_tokens": 96308840.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 3499,
      "step_time": 27.80487647652626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 123.9375,
      "completions/mean_terminated_length": 123.9375,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.2315989024937153,
      "epoch": 0.1621120889300602,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006247741635888815,
      "kl": 0.003260422556195408,
      "learning_rate": 9.675868457619268e-07,
      "loss": 0.0002,
      "num_tokens": 96328231.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3500,
      "step_time": 13.85839718952775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 175.9375,
      "completions/mean_terminated_length": 175.9375,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.22639838978648186,
      "epoch": 0.1621584066697545,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1781572848558426,
      "kl": 0.011106195393949747,
      "learning_rate": 9.67577582213988e-07,
      "loss": -0.0405,
      "num_tokens": 96350086.0,
      "reward": 0.3874533176422119,
      "reward_std": 0.26677781343460083,
      "rewards/reward_func/mean": 0.3874533176422119,
      "rewards/reward_func/std": 0.26677781343460083,
      "step": 3501,
      "step_time": 18.180234760046005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 139.9375,
      "completions/mean_terminated_length": 139.9375,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.30354005843400955,
      "epoch": 0.16220472440944883,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018522125901654363,
      "kl": 0.0015996306610759348,
      "learning_rate": 9.67568318666049e-07,
      "loss": 0.0001,
      "num_tokens": 96386197.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3502,
      "step_time": 18.0606238655746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 278.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 254.625,
      "completions/mean_terminated_length": 254.625,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "entropy": 0.22315406426787376,
      "epoch": 0.16225104214914313,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08620049804449081,
      "kl": 0.007499936851672828,
      "learning_rate": 9.675590551181101e-07,
      "loss": -0.0355,
      "num_tokens": 96418431.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 3503,
      "step_time": 25.348066557198763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 138.5,
      "completions/mean_terminated_length": 138.5,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.3328055292367935,
      "epoch": 0.16229735988883742,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003742165630683303,
      "kl": 0.0026473395992070436,
      "learning_rate": 9.675497915701713e-07,
      "loss": 0.0001,
      "num_tokens": 96441463.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3504,
      "step_time": 15.532160520553589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 145.1875,
      "completions/mean_terminated_length": 145.1875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.2995671108365059,
      "epoch": 0.16234367762853172,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004644697532057762,
      "kl": 0.002737248345511034,
      "learning_rate": 9.675405280222326e-07,
      "loss": 0.0001,
      "num_tokens": 96464122.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3505,
      "step_time": 15.495228987187147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 294.0,
      "completions/max_terminated_length": 294.0,
      "completions/mean_length": 196.1875,
      "completions/mean_terminated_length": 196.1875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.44053924083709717,
      "epoch": 0.16238999536822604,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14676623046398163,
      "kl": 0.007510208641178906,
      "learning_rate": 9.675312644742935e-07,
      "loss": 0.125,
      "num_tokens": 96488045.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 3506,
      "step_time": 24.72891664132476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 125.625,
      "completions/mean_terminated_length": 125.625,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.31276872754096985,
      "epoch": 0.16243631310792034,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032454750034958124,
      "kl": 0.002006040042033419,
      "learning_rate": 9.675220009263546e-07,
      "loss": 0.0001,
      "num_tokens": 96516039.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3507,
      "step_time": 15.469757694751024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 139.1875,
      "completions/mean_terminated_length": 139.1875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.2615719810128212,
      "epoch": 0.16248263084761463,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014279045863077044,
      "kl": 0.0014216804702300578,
      "learning_rate": 9.67512737378416e-07,
      "loss": 0.0001,
      "num_tokens": 96538010.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3508,
      "step_time": 14.813536275178194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 130.9375,
      "completions/mean_terminated_length": 130.9375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2843427509069443,
      "epoch": 0.16252894858730893,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002361882012337446,
      "kl": 0.002055309771094471,
      "learning_rate": 9.67503473830477e-07,
      "loss": 0.0001,
      "num_tokens": 96560105.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3509,
      "step_time": 14.728366624563932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 303.0,
      "completions/max_terminated_length": 303.0,
      "completions/mean_length": 200.25,
      "completions/mean_terminated_length": 200.25,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.22428186237812042,
      "epoch": 0.16257526632700325,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13002124428749084,
      "kl": 0.005687119672074914,
      "learning_rate": 9.674942102825382e-07,
      "loss": 0.1285,
      "num_tokens": 96582269.0,
      "reward": 0.481328547000885,
      "reward_std": 0.1283542811870575,
      "rewards/reward_func/mean": 0.481328547000885,
      "rewards/reward_func/std": 0.1283542811870575,
      "step": 3510,
      "step_time": 25.801049027591944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 153.0,
      "completions/mean_terminated_length": 153.0,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.352471724152565,
      "epoch": 0.16262158406669755,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004096616059541702,
      "kl": 0.0030700730276294053,
      "learning_rate": 9.674849467345994e-07,
      "loss": 0.0002,
      "num_tokens": 96604445.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3511,
      "step_time": 16.6450727134943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 138.4375,
      "completions/mean_terminated_length": 138.4375,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.23251745104789734,
      "epoch": 0.16266790180639185,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00755663076415658,
      "kl": 0.005337439710274339,
      "learning_rate": 9.674756831866605e-07,
      "loss": 0.0003,
      "num_tokens": 96627044.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3512,
      "step_time": 16.149804331362247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 168.625,
      "completions/mean_terminated_length": 168.625,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.38153017312288284,
      "epoch": 0.16271421954608614,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007727692369371653,
      "kl": 0.006425941362977028,
      "learning_rate": 9.674664196387216e-07,
      "loss": 0.0003,
      "num_tokens": 96649710.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3513,
      "step_time": 19.649431314319372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 186.0,
      "completions/mean_terminated_length": 186.0,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.38822100311517715,
      "epoch": 0.16276053728578047,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037169994320720434,
      "kl": 0.002981725672725588,
      "learning_rate": 9.674571560907827e-07,
      "loss": 0.0002,
      "num_tokens": 96676782.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3514,
      "step_time": 21.991681169718504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 125.375,
      "completions/mean_terminated_length": 125.375,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.33966681361198425,
      "epoch": 0.16280685502547476,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031428884249180555,
      "kl": 0.0028423998737707734,
      "learning_rate": 9.674478925428439e-07,
      "loss": 0.0001,
      "num_tokens": 96699524.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3515,
      "step_time": 14.928597826510668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 198.375,
      "completions/mean_terminated_length": 198.375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.2440076507627964,
      "epoch": 0.16285317276516906,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011665847152471542,
      "kl": 0.006670278729870915,
      "learning_rate": 9.67438628994905e-07,
      "loss": 0.0003,
      "num_tokens": 96723242.0,
      "reward": 0.5795782804489136,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5795782804489136,
      "rewards/reward_func/std": 0.0,
      "step": 3516,
      "step_time": 23.105410888791084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 160.25,
      "completions/mean_terminated_length": 160.25,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.1922718994319439,
      "epoch": 0.16289949050486335,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006182128097862005,
      "kl": 0.0038107127184048295,
      "learning_rate": 9.67429365446966e-07,
      "loss": 0.0002,
      "num_tokens": 96751166.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 3517,
      "step_time": 19.49392169341445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 139.9375,
      "completions/mean_terminated_length": 139.9375,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.35049815475940704,
      "epoch": 0.16294580824455768,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002779029542580247,
      "kl": 0.002253408427350223,
      "learning_rate": 9.674201018990272e-07,
      "loss": 0.0001,
      "num_tokens": 96777677.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3518,
      "step_time": 17.656462874263525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 170.0625,
      "completions/mean_terminated_length": 170.0625,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.2003810554742813,
      "epoch": 0.16299212598425197,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014294126071035862,
      "kl": 0.06613391451537609,
      "learning_rate": 9.674108383510884e-07,
      "loss": 0.0033,
      "num_tokens": 96807390.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3519,
      "step_time": 19.170617293566465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 127.3125,
      "completions/mean_terminated_length": 127.3125,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.29370229691267014,
      "epoch": 0.16303844372394627,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004364653490483761,
      "kl": 0.002967514330521226,
      "learning_rate": 9.674015748031495e-07,
      "loss": 0.0001,
      "num_tokens": 96830963.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3520,
      "step_time": 14.375239446759224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 151.4375,
      "completions/mean_terminated_length": 151.4375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.3580208495259285,
      "epoch": 0.16308476146364057,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033027513418346643,
      "kl": 0.002833028440363705,
      "learning_rate": 9.673923112552106e-07,
      "loss": 0.0001,
      "num_tokens": 96852346.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3521,
      "step_time": 16.44275674968958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 155.125,
      "completions/mean_terminated_length": 155.125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.18305179476737976,
      "epoch": 0.1631310792033349,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005787827540189028,
      "kl": 0.0037083630450069904,
      "learning_rate": 9.67383047707272e-07,
      "loss": 0.0002,
      "num_tokens": 96873340.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 3522,
      "step_time": 16.114144783467054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 172.8125,
      "completions/mean_terminated_length": 172.8125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.2033519260585308,
      "epoch": 0.16317739694302918,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004432092420756817,
      "kl": 0.0017462515970692039,
      "learning_rate": 9.67373784159333e-07,
      "loss": 0.0001,
      "num_tokens": 96908665.0,
      "reward": 0.11362193524837494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11362193524837494,
      "rewards/reward_func/std": 0.0,
      "step": 3523,
      "step_time": 20.865708526223898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 119.6875,
      "completions/mean_terminated_length": 119.6875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.27791858464479446,
      "epoch": 0.16322371468272348,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004077862948179245,
      "kl": 0.002710001775994897,
      "learning_rate": 9.673645206113942e-07,
      "loss": 0.0001,
      "num_tokens": 96932772.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3524,
      "step_time": 14.771175995469093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 131.625,
      "completions/mean_terminated_length": 131.625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3206168934702873,
      "epoch": 0.16327003242241778,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002391499001532793,
      "kl": 0.0020895492634736,
      "learning_rate": 9.673552570634553e-07,
      "loss": 0.0001,
      "num_tokens": 96963678.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3525,
      "step_time": 16.159237887710333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 206.25,
      "completions/mean_terminated_length": 206.25,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.36402788758277893,
      "epoch": 0.1633163501621121,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09697293490171432,
      "kl": 0.008576249238103628,
      "learning_rate": 9.673459935155164e-07,
      "loss": -0.0083,
      "num_tokens": 96988690.0,
      "reward": 0.0159545186907053,
      "reward_std": 0.033227622509002686,
      "rewards/reward_func/mean": 0.0159545186907053,
      "rewards/reward_func/std": 0.033227622509002686,
      "step": 3526,
      "step_time": 22.216883279383183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 154.4375,
      "completions/mean_terminated_length": 154.4375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.29022638499736786,
      "epoch": 0.1633626679018064,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007445821538567543,
      "kl": 0.0036948389606550336,
      "learning_rate": 9.673367299675776e-07,
      "loss": 0.0002,
      "num_tokens": 97011161.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 3527,
      "step_time": 18.00516940653324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 135.875,
      "completions/mean_terminated_length": 135.875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.25457216054201126,
      "epoch": 0.1634089856415007,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023652641102671623,
      "kl": 0.0014288907987065613,
      "learning_rate": 9.673274664196387e-07,
      "loss": 0.0001,
      "num_tokens": 97036583.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3528,
      "step_time": 15.217930767685175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 181.3125,
      "completions/mean_terminated_length": 181.3125,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.289127416908741,
      "epoch": 0.163455303381195,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1538974940776825,
      "kl": 0.013005726970732212,
      "learning_rate": 9.673182028716998e-07,
      "loss": -0.1096,
      "num_tokens": 97058172.0,
      "reward": 0.17225128412246704,
      "reward_std": 0.3080783188343048,
      "rewards/reward_func/mean": 0.17225128412246704,
      "rewards/reward_func/std": 0.3080783188343048,
      "step": 3529,
      "step_time": 20.44297206401825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 333.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 248.875,
      "completions/mean_terminated_length": 248.875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.34234559535980225,
      "epoch": 0.1635016211208893,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07779324054718018,
      "kl": 0.007530900416895747,
      "learning_rate": 9.67308939323761e-07,
      "loss": -0.1397,
      "num_tokens": 97085626.0,
      "reward": 0.37527552247047424,
      "reward_std": 0.4395437240600586,
      "rewards/reward_func/mean": 0.37527552247047424,
      "rewards/reward_func/std": 0.4395437240600586,
      "step": 3530,
      "step_time": 28.89034976810217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 129.5625,
      "completions/mean_terminated_length": 129.5625,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2742532603442669,
      "epoch": 0.1635479388605836,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032103965058922768,
      "kl": 0.0024091703817248344,
      "learning_rate": 9.67299675775822e-07,
      "loss": 0.0001,
      "num_tokens": 97106915.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3531,
      "step_time": 14.497124005109072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 130.5625,
      "completions/mean_terminated_length": 130.5625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.35865744948387146,
      "epoch": 0.1635942566002779,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035574487410485744,
      "kl": 0.0029727817163802683,
      "learning_rate": 9.672904122278832e-07,
      "loss": 0.0001,
      "num_tokens": 97128940.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3532,
      "step_time": 14.25592328235507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 310.0,
      "completions/max_terminated_length": 310.0,
      "completions/mean_length": 239.375,
      "completions/mean_terminated_length": 239.375,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.31367600709199905,
      "epoch": 0.1636405743399722,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1216372549533844,
      "kl": 0.021039447281509638,
      "learning_rate": 9.672811486799443e-07,
      "loss": -0.1128,
      "num_tokens": 97154658.0,
      "reward": 0.4968356192111969,
      "reward_std": 0.5131920576095581,
      "rewards/reward_func/mean": 0.4968356192111969,
      "rewards/reward_func/std": 0.5131920576095581,
      "step": 3533,
      "step_time": 26.200247816741467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 124.5625,
      "completions/mean_terminated_length": 124.5625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2965153381228447,
      "epoch": 0.16368689207966652,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004787066485732794,
      "kl": 0.002312621392775327,
      "learning_rate": 9.672718851320054e-07,
      "loss": 0.0001,
      "num_tokens": 97176571.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3534,
      "step_time": 14.440358653664589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 128.5625,
      "completions/mean_terminated_length": 128.5625,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.2939995676279068,
      "epoch": 0.16373320981936082,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003910813480615616,
      "kl": 0.0028675563517026603,
      "learning_rate": 9.672626215840668e-07,
      "loss": 0.0001,
      "num_tokens": 97196468.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3535,
      "step_time": 14.77380719780922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 125.625,
      "completions/mean_terminated_length": 125.625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.26630087196826935,
      "epoch": 0.16377952755905512,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031438334845006466,
      "kl": 0.0021875373204238713,
      "learning_rate": 9.67253358036128e-07,
      "loss": 0.0001,
      "num_tokens": 97217566.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3536,
      "step_time": 13.541229378432035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 120.8125,
      "completions/mean_terminated_length": 120.8125,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.28888891637325287,
      "epoch": 0.1638258452987494,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005562159698456526,
      "kl": 0.0032723327167332172,
      "learning_rate": 9.672440944881888e-07,
      "loss": 0.0002,
      "num_tokens": 97238091.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3537,
      "step_time": 13.543254546821117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 190.8125,
      "completions/mean_terminated_length": 190.8125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.272205114364624,
      "epoch": 0.16387216303844374,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.03555941954255104,
      "kl": 0.014776590745896101,
      "learning_rate": 9.672348309402502e-07,
      "loss": 0.0007,
      "num_tokens": 97259752.0,
      "reward": 0.4191337525844574,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.4191337525844574,
      "rewards/reward_func/std": 0.0,
      "step": 3538,
      "step_time": 19.38827906176448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 243.9375,
      "completions/mean_terminated_length": 243.9375,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "entropy": 0.2799694687128067,
      "epoch": 0.16391848077813803,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004785658325999975,
      "kl": 0.0038703237078152597,
      "learning_rate": 9.672255673923113e-07,
      "loss": 0.0002,
      "num_tokens": 97293255.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3539,
      "step_time": 25.595364157110453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 267.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 227.4375,
      "completions/mean_terminated_length": 227.4375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.2612280547618866,
      "epoch": 0.16396479851783233,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07925572246313095,
      "kl": 0.015011367620900273,
      "learning_rate": 9.672163038443724e-07,
      "loss": -0.1021,
      "num_tokens": 97318830.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 3540,
      "step_time": 23.21995610371232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 230.875,
      "completions/mean_terminated_length": 230.875,
      "completions/min_length": 203.0,
      "completions/min_terminated_length": 203.0,
      "entropy": 0.29261116683483124,
      "epoch": 0.16401111625752662,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0067114802077412605,
      "kl": 0.006157987168990076,
      "learning_rate": 9.672070402964335e-07,
      "loss": 0.0003,
      "num_tokens": 97357020.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3541,
      "step_time": 25.231066454201937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 166.8125,
      "completions/mean_terminated_length": 166.8125,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.46383872628211975,
      "epoch": 0.16405743399722095,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026960340328514576,
      "kl": 0.0028817428392358124,
      "learning_rate": 9.671977767484947e-07,
      "loss": 0.0001,
      "num_tokens": 97403401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3542,
      "step_time": 23.84604797139764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 163.0625,
      "completions/mean_terminated_length": 163.0625,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.2743274196982384,
      "epoch": 0.16410375173691524,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010367260314524174,
      "kl": 0.00846461532637477,
      "learning_rate": 9.671885132005558e-07,
      "loss": 0.0004,
      "num_tokens": 97427178.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3543,
      "step_time": 18.02756090834737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 167.375,
      "completions/mean_terminated_length": 167.375,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.3865014314651489,
      "epoch": 0.16415006947660954,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017552862409502268,
      "kl": 0.0021251430734992027,
      "learning_rate": 9.67179249652617e-07,
      "loss": 0.0001,
      "num_tokens": 97477280.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3544,
      "step_time": 25.08999341726303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 131.3125,
      "completions/mean_terminated_length": 131.3125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.292327456176281,
      "epoch": 0.16419638721630384,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005096784792840481,
      "kl": 0.002694442169740796,
      "learning_rate": 9.67169986104678e-07,
      "loss": 0.0001,
      "num_tokens": 97499125.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3545,
      "step_time": 14.179843433201313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 118.5,
      "completions/mean_terminated_length": 118.5,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.24239921942353249,
      "epoch": 0.16424270495599816,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002874167403206229,
      "kl": 0.001770373055478558,
      "learning_rate": 9.671607225567392e-07,
      "loss": 0.0001,
      "num_tokens": 97519053.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3546,
      "step_time": 13.279848337173462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 133.25,
      "completions/mean_terminated_length": 133.25,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2715304493904114,
      "epoch": 0.16428902269569245,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024509415961802006,
      "kl": 0.002041867992375046,
      "learning_rate": 9.671514590088003e-07,
      "loss": 0.0001,
      "num_tokens": 97544321.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3547,
      "step_time": 16.602627348154783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 141.375,
      "completions/mean_terminated_length": 141.375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.4028412252664566,
      "epoch": 0.16433534043538675,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033263410441577435,
      "kl": 0.002346788882277906,
      "learning_rate": 9.671421954608616e-07,
      "loss": 0.0001,
      "num_tokens": 97565479.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3548,
      "step_time": 16.204454492777586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 167.6875,
      "completions/mean_terminated_length": 167.6875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.24278324842453003,
      "epoch": 0.16438165817508105,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0047670938074588776,
      "kl": 0.0036830284516327083,
      "learning_rate": 9.671329319129225e-07,
      "loss": 0.0002,
      "num_tokens": 97586722.0,
      "reward": 0.522855818271637,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.522855818271637,
      "rewards/reward_func/std": 0.0,
      "step": 3549,
      "step_time": 16.94256315380335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 237.9375,
      "completions/mean_terminated_length": 237.9375,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "entropy": 0.28508298844099045,
      "epoch": 0.16442797591477537,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09614391624927521,
      "kl": 0.005944852251559496,
      "learning_rate": 9.671236683649836e-07,
      "loss": -0.0285,
      "num_tokens": 97610209.0,
      "reward": 0.6619340181350708,
      "reward_std": 0.21096941828727722,
      "rewards/reward_func/mean": 0.6619340181350708,
      "rewards/reward_func/std": 0.21096940338611603,
      "step": 3550,
      "step_time": 23.33081215620041
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 176.8125,
      "completions/mean_terminated_length": 176.8125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.385079562664032,
      "epoch": 0.16447429365446967,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036837179213762283,
      "kl": 0.003025463200174272,
      "learning_rate": 9.671144048170448e-07,
      "loss": 0.0002,
      "num_tokens": 97638302.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3551,
      "step_time": 19.04620984196663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 136.5,
      "completions/mean_terminated_length": 136.5,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.23403598368167877,
      "epoch": 0.16452061139416396,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0040371473878622055,
      "kl": 0.0025438999000471085,
      "learning_rate": 9.671051412691061e-07,
      "loss": 0.0001,
      "num_tokens": 97658854.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3552,
      "step_time": 15.845664210617542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 194.9375,
      "completions/mean_terminated_length": 194.9375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.3761293590068817,
      "epoch": 0.16456692913385826,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07410187274217606,
      "kl": 0.006458802963607013,
      "learning_rate": 9.670958777211672e-07,
      "loss": 0.0233,
      "num_tokens": 97689733.0,
      "reward": 0.05592745915055275,
      "reward_std": 0.223709836602211,
      "rewards/reward_func/mean": 0.05592745915055275,
      "rewards/reward_func/std": 0.223709836602211,
      "step": 3553,
      "step_time": 21.835889488458633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 122.5625,
      "completions/mean_terminated_length": 122.5625,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.20000215247273445,
      "epoch": 0.16461324687355258,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025598048232495785,
      "kl": 0.001941290043760091,
      "learning_rate": 9.670866141732284e-07,
      "loss": 0.0001,
      "num_tokens": 97709070.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3554,
      "step_time": 13.468267250806093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 332.0,
      "completions/max_terminated_length": 332.0,
      "completions/mean_length": 269.875,
      "completions/mean_terminated_length": 269.875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.2784581035375595,
      "epoch": 0.16465956461324688,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07169782370328903,
      "kl": 0.01579317357391119,
      "learning_rate": 9.670773506252895e-07,
      "loss": -0.1591,
      "num_tokens": 97732908.0,
      "reward": 0.6524466276168823,
      "reward_std": 0.33146098256111145,
      "rewards/reward_func/mean": 0.6524466276168823,
      "rewards/reward_func/std": 0.33146098256111145,
      "step": 3555,
      "step_time": 26.99260038509965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 130.6875,
      "completions/mean_terminated_length": 130.6875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.26143621653318405,
      "epoch": 0.16470588235294117,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0056390464305877686,
      "kl": 0.002976380579639226,
      "learning_rate": 9.670680870773506e-07,
      "loss": 0.0001,
      "num_tokens": 97754487.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3556,
      "step_time": 13.53805648908019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 177.0,
      "completions/mean_terminated_length": 177.0,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.36464864015579224,
      "epoch": 0.16475220009263547,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003303313162177801,
      "kl": 0.002488663129042834,
      "learning_rate": 9.670588235294117e-07,
      "loss": 0.0001,
      "num_tokens": 97787639.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3557,
      "step_time": 20.25088758021593
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 187.25,
      "completions/mean_terminated_length": 187.25,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.20431096479296684,
      "epoch": 0.1647985178323298,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001888621598482132,
      "kl": 0.0036209102836437523,
      "learning_rate": 9.670495599814729e-07,
      "loss": 0.0002,
      "num_tokens": 97815531.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 3558,
      "step_time": 21.710391961038113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 120.0625,
      "completions/mean_terminated_length": 120.0625,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2611607015132904,
      "epoch": 0.1648448355720241,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003172141732648015,
      "kl": 0.0021691049623768777,
      "learning_rate": 9.67040296433534e-07,
      "loss": 0.0001,
      "num_tokens": 97835020.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3559,
      "step_time": 12.920670833438635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 137.1875,
      "completions/mean_terminated_length": 137.1875,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.20644544437527657,
      "epoch": 0.16489115331171839,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003373936051502824,
      "kl": 0.0022112150327302516,
      "learning_rate": 9.670310328855951e-07,
      "loss": 0.0001,
      "num_tokens": 97855583.0,
      "reward": 1.6640468774513188e-14,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.6640468774513188e-14,
      "rewards/reward_func/std": 0.0,
      "step": 3560,
      "step_time": 16.511867452412844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 200.0625,
      "completions/mean_terminated_length": 200.0625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.38920268416404724,
      "epoch": 0.16493747105141268,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1139177531003952,
      "kl": 0.01539679802954197,
      "learning_rate": 9.670217693376562e-07,
      "loss": 0.0144,
      "num_tokens": 97886784.0,
      "reward": 0.5,
      "reward_std": 0.5163977742195129,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5163977742195129,
      "step": 3561,
      "step_time": 24.593369621783495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 147.8125,
      "completions/mean_terminated_length": 147.8125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.3605232983827591,
      "epoch": 0.164983788791107,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035274229012429714,
      "kl": 0.002665524953044951,
      "learning_rate": 9.670125057897174e-07,
      "loss": 0.0001,
      "num_tokens": 97924061.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3562,
      "step_time": 20.690412435680628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 141.5625,
      "completions/mean_terminated_length": 141.5625,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3203904703259468,
      "epoch": 0.1650301065308013,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00446776719763875,
      "kl": 0.003133551625069231,
      "learning_rate": 9.670032422417785e-07,
      "loss": 0.0002,
      "num_tokens": 97945654.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3563,
      "step_time": 14.90017794445157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 120.1875,
      "completions/mean_terminated_length": 120.1875,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.3613254576921463,
      "epoch": 0.1650764242704956,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003320826217532158,
      "kl": 0.002470236911904067,
      "learning_rate": 9.669939786938396e-07,
      "loss": 0.0001,
      "num_tokens": 97972617.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3564,
      "step_time": 15.292906329035759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 168.0625,
      "completions/mean_terminated_length": 168.0625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.28290047869086266,
      "epoch": 0.1651227420101899,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.36806610226631165,
      "kl": 0.01397179055493325,
      "learning_rate": 9.66984715145901e-07,
      "loss": 0.0664,
      "num_tokens": 97997306.0,
      "reward": 0.5625,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.5625,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 3565,
      "step_time": 19.496180344372988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 155.9375,
      "completions/mean_terminated_length": 155.9375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3890189155936241,
      "epoch": 0.16516905974988422,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.02676943875849247,
      "kl": 0.014399828622117639,
      "learning_rate": 9.66975451597962e-07,
      "loss": 0.0007,
      "num_tokens": 98032745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3566,
      "step_time": 22.030289605259895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 180.4375,
      "completions/mean_terminated_length": 180.4375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.1966230422258377,
      "epoch": 0.1652153774895785,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018266900442540646,
      "kl": 0.0015785284340381622,
      "learning_rate": 9.669661880500232e-07,
      "loss": 0.0001,
      "num_tokens": 98057072.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 3567,
      "step_time": 19.08217379450798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 114.5,
      "completions/mean_terminated_length": 114.5,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2490151971578598,
      "epoch": 0.1652616952292728,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033037809189409018,
      "kl": 0.0022920635528862476,
      "learning_rate": 9.669569245020841e-07,
      "loss": 0.0001,
      "num_tokens": 98076936.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3568,
      "step_time": 13.264443390071392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 148.5625,
      "completions/mean_terminated_length": 148.5625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.3315586596727371,
      "epoch": 0.1653080129689671,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023900519590824842,
      "kl": 0.002021096966927871,
      "learning_rate": 9.669476609541454e-07,
      "loss": 0.0001,
      "num_tokens": 98113073.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3569,
      "step_time": 19.52743361517787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 126.0,
      "completions/max_terminated_length": 126.0,
      "completions/mean_length": 111.4375,
      "completions/mean_terminated_length": 111.4375,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.27076223492622375,
      "epoch": 0.16535433070866143,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003763409797102213,
      "kl": 0.002294118225108832,
      "learning_rate": 9.669383974062066e-07,
      "loss": 0.0001,
      "num_tokens": 98132472.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3570,
      "step_time": 12.283430144190788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 210.9375,
      "completions/mean_terminated_length": 210.9375,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.25906703248620033,
      "epoch": 0.16540064844835572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0987786054611206,
      "kl": 0.008897863561287522,
      "learning_rate": 9.669291338582677e-07,
      "loss": -0.0449,
      "num_tokens": 98157399.0,
      "reward": 0.32327309250831604,
      "reward_std": 0.15240254998207092,
      "rewards/reward_func/mean": 0.32327309250831604,
      "rewards/reward_func/std": 0.15240256488323212,
      "step": 3571,
      "step_time": 22.225793097168207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 191.5,
      "completions/mean_terminated_length": 191.5,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3547776862978935,
      "epoch": 0.16544696618805002,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0881710946559906,
      "kl": 0.011886299354955554,
      "learning_rate": 9.669198703103288e-07,
      "loss": -0.0213,
      "num_tokens": 98181199.0,
      "reward": 0.11778545379638672,
      "reward_std": 0.22982218861579895,
      "rewards/reward_func/mean": 0.11778545379638672,
      "rewards/reward_func/std": 0.22982218861579895,
      "step": 3572,
      "step_time": 21.435869842767715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 156.625,
      "completions/mean_terminated_length": 156.625,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.435652494430542,
      "epoch": 0.16549328392774432,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016045479569584131,
      "kl": 0.0018821700941771269,
      "learning_rate": 9.6691060676239e-07,
      "loss": 0.0001,
      "num_tokens": 98231321.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3573,
      "step_time": 23.15938265994191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 278.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 192.5625,
      "completions/mean_terminated_length": 192.5625,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.33999859541654587,
      "epoch": 0.16553960166743864,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1332770437002182,
      "kl": 0.007052879314869642,
      "learning_rate": 9.66901343214451e-07,
      "loss": -0.0199,
      "num_tokens": 98265714.0,
      "reward": 0.6569495797157288,
      "reward_std": 0.45744097232818604,
      "rewards/reward_func/mean": 0.6569495797157288,
      "rewards/reward_func/std": 0.45744097232818604,
      "step": 3574,
      "step_time": 26.043886370956898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 358.0,
      "completions/max_terminated_length": 358.0,
      "completions/mean_length": 228.1875,
      "completions/mean_terminated_length": 228.1875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.4236639142036438,
      "epoch": 0.16558591940713294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09896840155124664,
      "kl": 0.00724818604066968,
      "learning_rate": 9.668920796665122e-07,
      "loss": 0.1189,
      "num_tokens": 98290341.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 3575,
      "step_time": 29.304279312491417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 199.1875,
      "completions/mean_terminated_length": 199.1875,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.28325093537569046,
      "epoch": 0.16563223714682723,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005324595142155886,
      "kl": 0.003494455828331411,
      "learning_rate": 9.668828161185733e-07,
      "loss": 0.0002,
      "num_tokens": 98316520.0,
      "reward": 0.084062859416008,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.084062859416008,
      "rewards/reward_func/std": 0.0,
      "step": 3576,
      "step_time": 20.90087179839611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 134.625,
      "completions/mean_terminated_length": 134.625,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.3959891349077225,
      "epoch": 0.16567855488652153,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003521413542330265,
      "kl": 0.002813965082168579,
      "learning_rate": 9.668735525706344e-07,
      "loss": 0.0001,
      "num_tokens": 98340066.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3577,
      "step_time": 15.995197925716639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 181.125,
      "completions/mean_terminated_length": 181.125,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.39261355996131897,
      "epoch": 0.16572487262621585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020177988335490227,
      "kl": 0.002152281114831567,
      "learning_rate": 9.668642890226958e-07,
      "loss": 0.0001,
      "num_tokens": 98380084.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3578,
      "step_time": 23.109325744211674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 183.1875,
      "completions/mean_terminated_length": 183.1875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.28571048378944397,
      "epoch": 0.16577119036591015,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09259343892335892,
      "kl": 0.004429163585882634,
      "learning_rate": 9.66855025474757e-07,
      "loss": 0.0136,
      "num_tokens": 98407431.0,
      "reward": 0.8894338607788086,
      "reward_std": 0.2371823638677597,
      "rewards/reward_func/mean": 0.8894338607788086,
      "rewards/reward_func/std": 0.2371823638677597,
      "step": 3579,
      "step_time": 18.640504773706198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 203.375,
      "completions/mean_terminated_length": 203.375,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "entropy": 0.1659403070807457,
      "epoch": 0.16581750810560444,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002949152607470751,
      "kl": 0.002286954957526177,
      "learning_rate": 9.668457619268178e-07,
      "loss": 0.0001,
      "num_tokens": 98433053.0,
      "reward": 0.9607894420623779,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9607894420623779,
      "rewards/reward_func/std": 0.0,
      "step": 3580,
      "step_time": 20.531835954636335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 132.625,
      "completions/mean_terminated_length": 132.625,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.2799430415034294,
      "epoch": 0.16586382584529874,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002769128419458866,
      "kl": 0.0019599811639636755,
      "learning_rate": 9.66836498378879e-07,
      "loss": 0.0001,
      "num_tokens": 98452663.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3581,
      "step_time": 14.744931012392044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 136.5,
      "completions/mean_terminated_length": 136.5,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2276948243379593,
      "epoch": 0.16591014358499306,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004453351255506277,
      "kl": 0.002728738298173994,
      "learning_rate": 9.668272348309403e-07,
      "loss": 0.0001,
      "num_tokens": 98472351.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3582,
      "step_time": 13.851874709129333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 172.5625,
      "completions/mean_terminated_length": 172.5625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.3971063941717148,
      "epoch": 0.16595646132468736,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023131745401769876,
      "kl": 0.002333566138986498,
      "learning_rate": 9.668179712830014e-07,
      "loss": 0.0001,
      "num_tokens": 98518312.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3583,
      "step_time": 25.018486488610506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 157.125,
      "completions/mean_terminated_length": 157.125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.36475537717342377,
      "epoch": 0.16600277906438166,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022115667816251516,
      "kl": 0.0019968247215729207,
      "learning_rate": 9.668087077350625e-07,
      "loss": 0.0001,
      "num_tokens": 98546938.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3584,
      "step_time": 17.789002742618322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 123.625,
      "completions/mean_terminated_length": 123.625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.28959520161151886,
      "epoch": 0.16604909680407595,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018079724395647645,
      "kl": 0.001498726400313899,
      "learning_rate": 9.667994441871237e-07,
      "loss": 0.0001,
      "num_tokens": 98569844.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3585,
      "step_time": 14.291997365653515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 215.625,
      "completions/mean_terminated_length": 215.625,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.2067979909479618,
      "epoch": 0.16609541454377028,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038332007825374603,
      "kl": 0.006445974169764668,
      "learning_rate": 9.667901806391848e-07,
      "loss": 0.0003,
      "num_tokens": 98602286.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3586,
      "step_time": 25.763326067477465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 191.8125,
      "completions/mean_terminated_length": 191.8125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.19510289654135704,
      "epoch": 0.16614173228346457,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1861986517906189,
      "kl": 0.007212674827314913,
      "learning_rate": 9.66780917091246e-07,
      "loss": -0.1037,
      "num_tokens": 98630843.0,
      "reward": 0.7634987831115723,
      "reward_std": 0.2769618332386017,
      "rewards/reward_func/mean": 0.7634987831115723,
      "rewards/reward_func/std": 0.2769618630409241,
      "step": 3587,
      "step_time": 23.240470733493567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 148.5625,
      "completions/mean_terminated_length": 148.5625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.35953081399202347,
      "epoch": 0.16618805002315887,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029343648348003626,
      "kl": 0.001841022924054414,
      "learning_rate": 9.66771653543307e-07,
      "loss": 0.0001,
      "num_tokens": 98666084.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3588,
      "step_time": 19.61584033817053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 148.6875,
      "completions/mean_terminated_length": 148.6875,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.3365582451224327,
      "epoch": 0.16623436776285316,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0048422785475850105,
      "kl": 0.002994633687194437,
      "learning_rate": 9.667623899953682e-07,
      "loss": 0.0002,
      "num_tokens": 98687599.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3589,
      "step_time": 17.233600221574306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 218.9375,
      "completions/mean_terminated_length": 218.9375,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.20218537375330925,
      "epoch": 0.1662806855025475,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10181840509176254,
      "kl": 0.00199402665020898,
      "learning_rate": 9.667531264474293e-07,
      "loss": -0.0012,
      "num_tokens": 98728046.0,
      "reward": 0.9775209426879883,
      "reward_std": 0.04832850396633148,
      "rewards/reward_func/mean": 0.9775209426879883,
      "rewards/reward_func/std": 0.048328500241041183,
      "step": 3590,
      "step_time": 25.279921278357506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 197.8125,
      "completions/mean_terminated_length": 197.8125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.22607450187206268,
      "epoch": 0.16632700324224178,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002626505447551608,
      "kl": 0.002227245655376464,
      "learning_rate": 9.667438628994904e-07,
      "loss": 0.0001,
      "num_tokens": 98753771.0,
      "reward": 0.19180183112621307,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.19180183112621307,
      "rewards/reward_func/std": 0.0,
      "step": 3591,
      "step_time": 21.07438062131405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 142.1875,
      "completions/mean_terminated_length": 142.1875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.30981430411338806,
      "epoch": 0.16637332098193608,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032275815028697252,
      "kl": 0.0024592981790192425,
      "learning_rate": 9.667345993515515e-07,
      "loss": 0.0001,
      "num_tokens": 98789742.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3592,
      "step_time": 19.41781486943364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 211.5,
      "completions/mean_terminated_length": 211.5,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.20867987722158432,
      "epoch": 0.16641963872163038,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029367273673415184,
      "kl": 0.0023950578179210424,
      "learning_rate": 9.667253358036127e-07,
      "loss": 0.0001,
      "num_tokens": 98827558.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3593,
      "step_time": 24.983286380767822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 199.0625,
      "completions/mean_terminated_length": 199.0625,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.2755452021956444,
      "epoch": 0.1664659564613247,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1034320741891861,
      "kl": 0.017279275692999363,
      "learning_rate": 9.667160722556738e-07,
      "loss": 0.0045,
      "num_tokens": 98848839.0,
      "reward": 0.9962133169174194,
      "reward_std": 0.015146732330322266,
      "rewards/reward_func/mean": 0.9962133169174194,
      "rewards/reward_func/std": 0.015146732330322266,
      "step": 3594,
      "step_time": 22.22673163190484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 316.0,
      "completions/max_terminated_length": 316.0,
      "completions/mean_length": 239.875,
      "completions/mean_terminated_length": 239.875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.40091605484485626,
      "epoch": 0.166512274201019,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0701683908700943,
      "kl": 0.011251306626945734,
      "learning_rate": 9.667068087077351e-07,
      "loss": -0.1091,
      "num_tokens": 98879765.0,
      "reward": 0.1425301730632782,
      "reward_std": 0.21833959221839905,
      "rewards/reward_func/mean": 0.1425301730632782,
      "rewards/reward_func/std": 0.21833959221839905,
      "step": 3595,
      "step_time": 27.728194940835238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 219.6875,
      "completions/mean_terminated_length": 219.6875,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.19141894206404686,
      "epoch": 0.1665585919407133,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003598688868805766,
      "kl": 0.0027538148569874465,
      "learning_rate": 9.666975451597962e-07,
      "loss": 0.0001,
      "num_tokens": 98911760.0,
      "reward": 0.9560167789459229,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9560167789459229,
      "rewards/reward_func/std": 0.0,
      "step": 3596,
      "step_time": 23.942966651171446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 186.5,
      "completions/mean_terminated_length": 186.5,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.33885566890239716,
      "epoch": 0.1666049096804076,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030243494547903538,
      "kl": 0.002889836789108813,
      "learning_rate": 9.666882816118574e-07,
      "loss": 0.0001,
      "num_tokens": 98948744.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3597,
      "step_time": 22.082825370132923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 149.125,
      "completions/mean_terminated_length": 149.125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.37782856822013855,
      "epoch": 0.1666512274201019,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038001537322998047,
      "kl": 0.002931522554717958,
      "learning_rate": 9.666790180639183e-07,
      "loss": 0.0001,
      "num_tokens": 98985018.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3598,
      "step_time": 19.916902281343937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 132.9375,
      "completions/mean_terminated_length": 132.9375,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.30282729864120483,
      "epoch": 0.1666975451597962,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020080318208783865,
      "kl": 0.0018979228334501386,
      "learning_rate": 9.666697545159796e-07,
      "loss": 0.0001,
      "num_tokens": 99021065.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3599,
      "step_time": 18.39753397554159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 195.6875,
      "completions/mean_terminated_length": 195.6875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.4366292282938957,
      "epoch": 0.1667438628994905,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00261685810983181,
      "kl": 0.0029804747900925577,
      "learning_rate": 9.666604909680407e-07,
      "loss": 0.0002,
      "num_tokens": 99046276.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3600,
      "step_time": 20.289067335426807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 186.4375,
      "completions/mean_terminated_length": 186.4375,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.33161909133195877,
      "epoch": 0.1667901806391848,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005478667560964823,
      "kl": 0.004064594686497003,
      "learning_rate": 9.666512274201019e-07,
      "loss": 0.0002,
      "num_tokens": 99073227.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 3601,
      "step_time": 21.90181877836585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 187.0625,
      "completions/mean_terminated_length": 187.0625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.27226074039936066,
      "epoch": 0.16683649837887912,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15579216182231903,
      "kl": 0.00792406452819705,
      "learning_rate": 9.66641963872163e-07,
      "loss": 0.0364,
      "num_tokens": 99094508.0,
      "reward": 0.6438685059547424,
      "reward_std": 0.17169827222824097,
      "rewards/reward_func/mean": 0.6438685059547424,
      "rewards/reward_func/std": 0.17169827222824097,
      "step": 3602,
      "step_time": 20.344610940665007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 171.25,
      "completions/mean_terminated_length": 171.25,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.2098471112549305,
      "epoch": 0.16688281611857342,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002205336233600974,
      "kl": 0.001868272986030206,
      "learning_rate": 9.666327003242241e-07,
      "loss": 0.0001,
      "num_tokens": 99119488.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3603,
      "step_time": 18.055586989969015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 224.5,
      "completions/mean_terminated_length": 224.5,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.3071671575307846,
      "epoch": 0.16692913385826771,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13708633184432983,
      "kl": 0.009593281196430326,
      "learning_rate": 9.666234367762852e-07,
      "loss": -0.0212,
      "num_tokens": 99157480.0,
      "reward": 0.7879804968833923,
      "reward_std": 0.013710908591747284,
      "rewards/reward_func/mean": 0.7879804968833923,
      "rewards/reward_func/std": 0.013710916973650455,
      "step": 3604,
      "step_time": 25.048242699354887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 143.75,
      "completions/mean_terminated_length": 143.75,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.1587459072470665,
      "epoch": 0.166975451597962,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0048674545250833035,
      "kl": 0.002053667150903493,
      "learning_rate": 9.666141732283464e-07,
      "loss": 0.0001,
      "num_tokens": 99178468.0,
      "reward": 0.8446319699287415,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8446319699287415,
      "rewards/reward_func/std": 0.0,
      "step": 3605,
      "step_time": 15.05143042653799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 132.0,
      "completions/mean_terminated_length": 132.0,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2497384026646614,
      "epoch": 0.16702176933765633,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003642859635874629,
      "kl": 0.0023387824185192585,
      "learning_rate": 9.666049096804075e-07,
      "loss": 0.0001,
      "num_tokens": 99197972.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3606,
      "step_time": 13.351442039012909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 200.125,
      "completions/mean_terminated_length": 200.125,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.4811240881681442,
      "epoch": 0.16706808707735063,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01033422164618969,
      "kl": 0.007307854946702719,
      "learning_rate": 9.665956461324686e-07,
      "loss": 0.0004,
      "num_tokens": 99222966.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3607,
      "step_time": 21.15530388429761
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 368.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 231.5,
      "completions/mean_terminated_length": 231.5,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.4534684345126152,
      "epoch": 0.16711440481704493,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12001563608646393,
      "kl": 0.009250278701074421,
      "learning_rate": 9.6658638258453e-07,
      "loss": -0.1372,
      "num_tokens": 99261582.0,
      "reward": 0.00035374873550608754,
      "reward_std": 0.0014149949420243502,
      "rewards/reward_func/mean": 0.00035374873550608754,
      "rewards/reward_func/std": 0.0014149949420243502,
      "step": 3608,
      "step_time": 32.87519274279475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 182.25,
      "completions/mean_terminated_length": 182.25,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.36298853158950806,
      "epoch": 0.16716072255673922,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11058811843395233,
      "kl": 0.03417673148214817,
      "learning_rate": 9.66577119036591e-07,
      "loss": -0.122,
      "num_tokens": 99282530.0,
      "reward": 0.17713744938373566,
      "reward_std": 0.2361832857131958,
      "rewards/reward_func/mean": 0.17713744938373566,
      "rewards/reward_func/std": 0.2361832708120346,
      "step": 3609,
      "step_time": 22.58516302704811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 167.75,
      "completions/mean_terminated_length": 167.75,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3905542939901352,
      "epoch": 0.16720704029643355,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0064782253466546535,
      "kl": 0.002873776655178517,
      "learning_rate": 9.665678554886522e-07,
      "loss": 0.0001,
      "num_tokens": 99318494.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3610,
      "step_time": 23.65369301661849
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 170.375,
      "completions/mean_terminated_length": 170.375,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.17546479031443596,
      "epoch": 0.16725335803612784,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09214676171541214,
      "kl": 0.005759982392191887,
      "learning_rate": 9.665585919407131e-07,
      "loss": -0.0209,
      "num_tokens": 99343796.0,
      "reward": 0.8912093043327332,
      "reward_std": 0.0704302117228508,
      "rewards/reward_func/mean": 0.8912093043327332,
      "rewards/reward_func/std": 0.0704302042722702,
      "step": 3611,
      "step_time": 17.869517344981432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 194.9375,
      "completions/mean_terminated_length": 194.9375,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.4239168018102646,
      "epoch": 0.16729967577582214,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007603461388498545,
      "kl": 0.006265804171562195,
      "learning_rate": 9.665493283927745e-07,
      "loss": 0.0003,
      "num_tokens": 99374947.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3612,
      "step_time": 21.957412358373404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 167.25,
      "completions/mean_terminated_length": 167.25,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.4286089539527893,
      "epoch": 0.16734599351551643,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020595502573996782,
      "kl": 0.002293061406817287,
      "learning_rate": 9.665400648448356e-07,
      "loss": 0.0001,
      "num_tokens": 99428343.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3613,
      "step_time": 26.00006529316306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 195.75,
      "completions/mean_terminated_length": 195.75,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.4116162583231926,
      "epoch": 0.16739231125521076,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008351681753993034,
      "kl": 0.006845270982012153,
      "learning_rate": 9.665308012968967e-07,
      "loss": 0.0003,
      "num_tokens": 99460259.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3614,
      "step_time": 22.543399397283792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 127.5,
      "completions/mean_terminated_length": 127.5,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.24369941279292107,
      "epoch": 0.16743862899490505,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002607213333249092,
      "kl": 0.0017242001194972545,
      "learning_rate": 9.665215377489578e-07,
      "loss": 0.0001,
      "num_tokens": 99479803.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3615,
      "step_time": 13.694619506597519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 157.5625,
      "completions/mean_terminated_length": 157.5625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.21114365756511688,
      "epoch": 0.16748494673459935,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09268580377101898,
      "kl": 0.0046520818723365664,
      "learning_rate": 9.66512274201019e-07,
      "loss": 0.0244,
      "num_tokens": 99501812.0,
      "reward": 0.9900055527687073,
      "reward_std": 0.027310028672218323,
      "rewards/reward_func/mean": 0.9900055527687073,
      "rewards/reward_func/std": 0.027310030534863472,
      "step": 3616,
      "step_time": 17.534027237445116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 152.0625,
      "completions/mean_terminated_length": 152.0625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.2408118136227131,
      "epoch": 0.16753126447429365,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08825484663248062,
      "kl": 0.003742107772268355,
      "learning_rate": 9.6650301065308e-07,
      "loss": -0.021,
      "num_tokens": 99522821.0,
      "reward": 0.9185318946838379,
      "reward_std": 0.021724820137023926,
      "rewards/reward_func/mean": 0.9185318946838379,
      "rewards/reward_func/std": 0.021724820137023926,
      "step": 3617,
      "step_time": 15.844024267047644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 146.0625,
      "completions/mean_terminated_length": 146.0625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.4209437370300293,
      "epoch": 0.16757758221398797,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002499717753380537,
      "kl": 0.0021618661703541875,
      "learning_rate": 9.664937471051412e-07,
      "loss": 0.0001,
      "num_tokens": 99545862.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3618,
      "step_time": 16.462530065327883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 344.0,
      "completions/max_terminated_length": 344.0,
      "completions/mean_length": 215.1875,
      "completions/mean_terminated_length": 215.1875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.34370262175798416,
      "epoch": 0.16762389995368226,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1858927607536316,
      "kl": 0.010673115029931068,
      "learning_rate": 9.664844835572023e-07,
      "loss": 0.1613,
      "num_tokens": 99569609.0,
      "reward": 0.7000278234481812,
      "reward_std": 0.41826578974723816,
      "rewards/reward_func/mean": 0.7000278234481812,
      "rewards/reward_func/std": 0.41826578974723816,
      "step": 3619,
      "step_time": 28.33480976894498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 137.9375,
      "completions/mean_terminated_length": 137.9375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.17957430332899094,
      "epoch": 0.16767021769337656,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.36428120732307434,
      "kl": 0.010026685078628361,
      "learning_rate": 9.664752200092635e-07,
      "loss": 0.0142,
      "num_tokens": 99594056.0,
      "reward": 0.8488635420799255,
      "reward_std": 0.09884501993656158,
      "rewards/reward_func/mean": 0.8488635420799255,
      "rewards/reward_func/std": 0.09884503483772278,
      "step": 3620,
      "step_time": 15.420955941081047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 204.0625,
      "completions/mean_terminated_length": 204.0625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.33192652463912964,
      "epoch": 0.16771653543307086,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1426052302122116,
      "kl": 0.019870974589139223,
      "learning_rate": 9.664659564613246e-07,
      "loss": -0.0734,
      "num_tokens": 99616201.0,
      "reward": 0.10064637660980225,
      "reward_std": 0.10394712537527084,
      "rewards/reward_func/mean": 0.10064637660980225,
      "rewards/reward_func/std": 0.10394713282585144,
      "step": 3621,
      "step_time": 21.123180232942104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 186.25,
      "completions/mean_terminated_length": 186.25,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.3310723751783371,
      "epoch": 0.16776285317276518,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038498728536069393,
      "kl": 0.002794923959299922,
      "learning_rate": 9.66456692913386e-07,
      "loss": 0.0001,
      "num_tokens": 99643709.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3622,
      "step_time": 19.18281963467598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 194.1875,
      "completions/mean_terminated_length": 194.1875,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.186534583568573,
      "epoch": 0.16780917091245948,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005528921727091074,
      "kl": 0.004269551369361579,
      "learning_rate": 9.664474293654468e-07,
      "loss": 0.0002,
      "num_tokens": 99670608.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3623,
      "step_time": 21.365838635712862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 151.875,
      "completions/mean_terminated_length": 151.875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.28029765188694,
      "epoch": 0.16785548865215377,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007722316309809685,
      "kl": 0.00351451471215114,
      "learning_rate": 9.66438165817508e-07,
      "loss": 0.0002,
      "num_tokens": 99691470.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3624,
      "step_time": 16.521307606250048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 150.0,
      "completions/mean_terminated_length": 150.0,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.2888113483786583,
      "epoch": 0.16790180639184807,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003515079850330949,
      "kl": 0.0021059686259832233,
      "learning_rate": 9.664289022695693e-07,
      "loss": 0.0001,
      "num_tokens": 99716206.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3625,
      "step_time": 17.258567236363888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 203.1875,
      "completions/mean_terminated_length": 203.1875,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "entropy": 0.19794563576579094,
      "epoch": 0.1679481241315424,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005172573961317539,
      "kl": 0.003150037897285074,
      "learning_rate": 9.664196387216304e-07,
      "loss": 0.0002,
      "num_tokens": 99747169.0,
      "reward": 0.6246347427368164,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6246347427368164,
      "rewards/reward_func/std": 0.0,
      "step": 3626,
      "step_time": 22.692824937403202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 168.1875,
      "completions/mean_terminated_length": 168.1875,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.13266050815582275,
      "epoch": 0.1679944418712367,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13565035164356232,
      "kl": 0.00248489883961156,
      "learning_rate": 9.664103751736915e-07,
      "loss": 0.0043,
      "num_tokens": 99768484.0,
      "reward": 0.9950882196426392,
      "reward_std": 0.01964726485311985,
      "rewards/reward_func/mean": 0.9950882196426392,
      "rewards/reward_func/std": 0.019647270441055298,
      "step": 3627,
      "step_time": 17.078135419636965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 189.4375,
      "completions/mean_terminated_length": 189.4375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.262265108525753,
      "epoch": 0.16804075961093098,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13999216258525848,
      "kl": 0.010434851050376892,
      "learning_rate": 9.664011116257527e-07,
      "loss": -0.0453,
      "num_tokens": 99793515.0,
      "reward": 0.5128942728042603,
      "reward_std": 0.30541178584098816,
      "rewards/reward_func/mean": 0.5128942728042603,
      "rewards/reward_func/std": 0.30541181564331055,
      "step": 3628,
      "step_time": 19.783077280968428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 150.6875,
      "completions/mean_terminated_length": 150.6875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.4042646214365959,
      "epoch": 0.16808707735062528,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005343710537999868,
      "kl": 0.003096283588092774,
      "learning_rate": 9.663918480778138e-07,
      "loss": 0.0002,
      "num_tokens": 99834902.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3629,
      "step_time": 21.930745758116245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 232.625,
      "completions/mean_terminated_length": 232.625,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "entropy": 0.355318583548069,
      "epoch": 0.1681333950903196,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10351262241601944,
      "kl": 0.015597585588693619,
      "learning_rate": 9.66382584529875e-07,
      "loss": -0.0117,
      "num_tokens": 99856944.0,
      "reward": 0.8472354412078857,
      "reward_std": 0.3326157331466675,
      "rewards/reward_func/mean": 0.8472354412078857,
      "rewards/reward_func/std": 0.3326157331466675,
      "step": 3630,
      "step_time": 23.895892221480608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 208.75,
      "completions/mean_terminated_length": 208.75,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.12586481124162674,
      "epoch": 0.1681797128300139,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0977628231048584,
      "kl": 0.002275153383379802,
      "learning_rate": 9.66373320981936e-07,
      "loss": -0.0957,
      "num_tokens": 99879692.0,
      "reward": 0.21041209995746613,
      "reward_std": 0.32232680916786194,
      "rewards/reward_func/mean": 0.21041209995746613,
      "rewards/reward_func/std": 0.3223268389701843,
      "step": 3631,
      "step_time": 21.60739677026868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 173.5,
      "completions/mean_terminated_length": 173.5,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.4112020879983902,
      "epoch": 0.1682260305697082,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027718129567801952,
      "kl": 0.0023315809085033834,
      "learning_rate": 9.663640574339972e-07,
      "loss": 0.0001,
      "num_tokens": 99918036.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3632,
      "step_time": 23.05859698727727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 186.375,
      "completions/mean_terminated_length": 186.375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3297823369503021,
      "epoch": 0.1682723483094025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1071990579366684,
      "kl": 0.015387247549369931,
      "learning_rate": 9.663547938860583e-07,
      "loss": -0.1113,
      "num_tokens": 99940170.0,
      "reward": 0.11924160271883011,
      "reward_std": 0.20211108028888702,
      "rewards/reward_func/mean": 0.11924160271883011,
      "rewards/reward_func/std": 0.20211108028888702,
      "step": 3633,
      "step_time": 22.06792050972581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 189.25,
      "completions/mean_terminated_length": 189.25,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.22469836100935936,
      "epoch": 0.16831866604909682,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004018844570964575,
      "kl": 0.003375634434632957,
      "learning_rate": 9.663455303381194e-07,
      "loss": 0.0002,
      "num_tokens": 99975582.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3634,
      "step_time": 21.8283023647964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 207.0625,
      "completions/mean_terminated_length": 207.0625,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.17764092236757278,
      "epoch": 0.1683649837887911,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00385016156360507,
      "kl": 0.008594031794928014,
      "learning_rate": 9.663362667901805e-07,
      "loss": 0.0004,
      "num_tokens": 100013503.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3635,
      "step_time": 23.138325460255146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 179.0,
      "completions/mean_terminated_length": 179.0,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.158108189702034,
      "epoch": 0.1684113015284854,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004413930233567953,
      "kl": 0.0019016998703591526,
      "learning_rate": 9.663270032422417e-07,
      "loss": 0.0001,
      "num_tokens": 100044399.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 3636,
      "step_time": 20.046655353158712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 153.9375,
      "completions/mean_terminated_length": 153.9375,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.19572916254401207,
      "epoch": 0.1684576192681797,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004057406447827816,
      "kl": 0.003427662595640868,
      "learning_rate": 9.663177396943028e-07,
      "loss": 0.0002,
      "num_tokens": 100065246.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3637,
      "step_time": 15.902507115155458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 148.125,
      "completions/mean_terminated_length": 148.125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.24629414826631546,
      "epoch": 0.16850393700787403,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23255613446235657,
      "kl": 0.015454307897016406,
      "learning_rate": 9.66308476146364e-07,
      "loss": -0.0161,
      "num_tokens": 100101824.0,
      "reward": 0.42951178550720215,
      "reward_std": 0.009121465496718884,
      "rewards/reward_func/mean": 0.42951178550720215,
      "rewards/reward_func/std": 0.00912146270275116,
      "step": 3638,
      "step_time": 18.848948996514082
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 298.0,
      "completions/max_terminated_length": 298.0,
      "completions/mean_length": 238.25,
      "completions/mean_terminated_length": 238.25,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.3296635001897812,
      "epoch": 0.16855025474756832,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09846010059118271,
      "kl": 0.010080184903927147,
      "learning_rate": 9.662992125984252e-07,
      "loss": -0.101,
      "num_tokens": 100134484.0,
      "reward": 0.25,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 3639,
      "step_time": 26.790442250669003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 183.875,
      "completions/mean_terminated_length": 183.875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.40635445713996887,
      "epoch": 0.16859657248726262,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013727196492254734,
      "kl": 0.006187449675053358,
      "learning_rate": 9.662899490504864e-07,
      "loss": 0.0003,
      "num_tokens": 100165330.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3640,
      "step_time": 21.61533609032631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 166.1875,
      "completions/mean_terminated_length": 166.1875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.22182074561715126,
      "epoch": 0.16864289022695692,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005874851252883673,
      "kl": 0.004019855405203998,
      "learning_rate": 9.662806855025473e-07,
      "loss": 0.0002,
      "num_tokens": 100190661.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 3641,
      "step_time": 19.146815598011017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 123.4375,
      "completions/mean_terminated_length": 123.4375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.33297599107027054,
      "epoch": 0.16868920796665124,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005786799360066652,
      "kl": 0.003008337924256921,
      "learning_rate": 9.662714219546086e-07,
      "loss": 0.0001,
      "num_tokens": 100215052.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3642,
      "step_time": 15.082947868853807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 185.875,
      "completions/mean_terminated_length": 185.875,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.39536600559949875,
      "epoch": 0.16873552570634553,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0043201870284974575,
      "kl": 0.00370199978351593,
      "learning_rate": 9.662621584066697e-07,
      "loss": 0.0002,
      "num_tokens": 100258298.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3643,
      "step_time": 24.709015142172575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 185.8125,
      "completions/mean_terminated_length": 185.8125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.4209500700235367,
      "epoch": 0.16878184344603983,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004707318264991045,
      "kl": 0.004353383614216,
      "learning_rate": 9.662528948587309e-07,
      "loss": 0.0002,
      "num_tokens": 100280759.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3644,
      "step_time": 19.487607669085264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 180.625,
      "completions/mean_terminated_length": 180.625,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3488072007894516,
      "epoch": 0.16882816118573413,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007075755391269922,
      "kl": 0.0051837723003700376,
      "learning_rate": 9.66243631310792e-07,
      "loss": 0.0003,
      "num_tokens": 100303057.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3645,
      "step_time": 19.613004866987467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 367.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 223.6875,
      "completions/mean_terminated_length": 223.6875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.3842403292655945,
      "epoch": 0.16887447892542845,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11126924306154251,
      "kl": 0.008895701728761196,
      "learning_rate": 9.662343677628531e-07,
      "loss": -0.2362,
      "num_tokens": 100340924.0,
      "reward": 0.16754452884197235,
      "reward_std": 0.3664451539516449,
      "rewards/reward_func/mean": 0.16754452884197235,
      "rewards/reward_func/std": 0.3664451539516449,
      "step": 3646,
      "step_time": 33.60752094164491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 138.9375,
      "completions/mean_terminated_length": 138.9375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.29095131158828735,
      "epoch": 0.16892079666512275,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004700443707406521,
      "kl": 0.002529663441237062,
      "learning_rate": 9.662251042149142e-07,
      "loss": 0.0001,
      "num_tokens": 100362155.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3647,
      "step_time": 16.778659086674452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 197.125,
      "completions/mean_terminated_length": 197.125,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.19357973709702492,
      "epoch": 0.16896711440481704,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14569637179374695,
      "kl": 0.012966676848009229,
      "learning_rate": 9.662158406669754e-07,
      "loss": -0.028,
      "num_tokens": 100391261.0,
      "reward": 0.557201087474823,
      "reward_std": 0.054861608892679214,
      "rewards/reward_func/mean": 0.557201087474823,
      "rewards/reward_func/std": 0.05486161261796951,
      "step": 3648,
      "step_time": 20.71492462977767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 136.1875,
      "completions/mean_terminated_length": 136.1875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2660072445869446,
      "epoch": 0.16901343214451134,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01897454261779785,
      "kl": 0.008365912595763803,
      "learning_rate": 9.662065771190365e-07,
      "loss": 0.0004,
      "num_tokens": 100410864.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3649,
      "step_time": 14.980790104717016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 314.0,
      "completions/max_terminated_length": 314.0,
      "completions/mean_length": 220.8125,
      "completions/mean_terminated_length": 220.8125,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.41321227699518204,
      "epoch": 0.16905974988420566,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1119815856218338,
      "kl": 0.011686134850606322,
      "learning_rate": 9.661973135710976e-07,
      "loss": -0.1593,
      "num_tokens": 100438685.0,
      "reward": 0.22011367976665497,
      "reward_std": 0.393751323223114,
      "rewards/reward_func/mean": 0.22011367976665497,
      "rewards/reward_func/std": 0.393751323223114,
      "step": 3650,
      "step_time": 27.304416824132204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 129.1875,
      "completions/mean_terminated_length": 129.1875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2917369455099106,
      "epoch": 0.16910606762389996,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027787622530013323,
      "kl": 0.002260442852275446,
      "learning_rate": 9.661880500231587e-07,
      "loss": 0.0001,
      "num_tokens": 100461184.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3651,
      "step_time": 14.18328521028161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 148.6875,
      "completions/mean_terminated_length": 148.6875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.393869049847126,
      "epoch": 0.16915238536359425,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025377217680215836,
      "kl": 0.0024148482480086386,
      "learning_rate": 9.6617878647522e-07,
      "loss": 0.0001,
      "num_tokens": 100506411.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3652,
      "step_time": 22.5497288107872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 168.25,
      "completions/mean_terminated_length": 168.25,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.3528295233845711,
      "epoch": 0.16919870310328855,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10889166593551636,
      "kl": 0.018985942471772432,
      "learning_rate": 9.661695229272812e-07,
      "loss": 0.0128,
      "num_tokens": 100527407.0,
      "reward": 0.21260768175125122,
      "reward_std": 0.393032044172287,
      "rewards/reward_func/mean": 0.21260768175125122,
      "rewards/reward_func/std": 0.393032044172287,
      "step": 3653,
      "step_time": 18.676404014229774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 118.5,
      "completions/mean_terminated_length": 118.5,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.34263648837804794,
      "epoch": 0.16924502084298287,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00307916896417737,
      "kl": 0.002494905376806855,
      "learning_rate": 9.661602593793421e-07,
      "loss": 0.0001,
      "num_tokens": 100552311.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3654,
      "step_time": 14.8729502633214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 180.25,
      "completions/mean_terminated_length": 180.25,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.34922515600919724,
      "epoch": 0.16929133858267717,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002448687329888344,
      "kl": 0.0023555216030217707,
      "learning_rate": 9.661509958314035e-07,
      "loss": 0.0001,
      "num_tokens": 100580059.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3655,
      "step_time": 20.91957689449191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 227.9375,
      "completions/mean_terminated_length": 227.9375,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "entropy": 0.18775209039449692,
      "epoch": 0.16933765632237147,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021545514464378357,
      "kl": 0.0016433187702205032,
      "learning_rate": 9.661417322834646e-07,
      "loss": 0.0001,
      "num_tokens": 100613370.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3656,
      "step_time": 24.909799750894308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 136.125,
      "completions/mean_terminated_length": 136.125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2396089732646942,
      "epoch": 0.16938397406206576,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004323905799537897,
      "kl": 0.002229051402537152,
      "learning_rate": 9.661324687355257e-07,
      "loss": 0.0001,
      "num_tokens": 100633020.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3657,
      "step_time": 14.211376182734966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 170.375,
      "completions/mean_terminated_length": 170.375,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.16995025426149368,
      "epoch": 0.16943029180176009,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1598382443189621,
      "kl": 0.026814636774361134,
      "learning_rate": 9.661232051875868e-07,
      "loss": -0.0344,
      "num_tokens": 100665922.0,
      "reward": 0.7106248736381531,
      "reward_std": 0.33894577622413635,
      "rewards/reward_func/mean": 0.7106248736381531,
      "rewards/reward_func/std": 0.33894577622413635,
      "step": 3658,
      "step_time": 19.974772695451975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 126.8125,
      "completions/mean_terminated_length": 126.8125,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.34412428736686707,
      "epoch": 0.16947660954145438,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038280535954982042,
      "kl": 0.0029077634681016207,
      "learning_rate": 9.66113941639648e-07,
      "loss": 0.0001,
      "num_tokens": 100701727.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3659,
      "step_time": 18.098359052091837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 130.375,
      "completions/mean_terminated_length": 130.375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.24189125001430511,
      "epoch": 0.16952292728114868,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003285208949819207,
      "kl": 0.0021098259603604674,
      "learning_rate": 9.66104678091709e-07,
      "loss": 0.0001,
      "num_tokens": 100721317.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3660,
      "step_time": 14.200971778482199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 187.0,
      "completions/mean_terminated_length": 187.0,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.23702138662338257,
      "epoch": 0.16956924502084297,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12934033572673798,
      "kl": 0.0036044674343429506,
      "learning_rate": 9.660954145437702e-07,
      "loss": 0.0309,
      "num_tokens": 100742597.0,
      "reward": 0.984572172164917,
      "reward_std": 0.06171126291155815,
      "rewards/reward_func/mean": 0.984572172164917,
      "rewards/reward_func/std": 0.06171126663684845,
      "step": 3661,
      "step_time": 19.861062217503786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 172.5625,
      "completions/mean_terminated_length": 172.5625,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.21542150154709816,
      "epoch": 0.1696155627605373,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006063431967049837,
      "kl": 0.0042796487687155604,
      "learning_rate": 9.660861509958313e-07,
      "loss": 0.0002,
      "num_tokens": 100765518.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 3662,
      "step_time": 17.872896548360586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 154.5625,
      "completions/mean_terminated_length": 154.5625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3327007219195366,
      "epoch": 0.1696618805002316,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007003942970186472,
      "kl": 0.006100799655541778,
      "learning_rate": 9.660768874478925e-07,
      "loss": 0.0003,
      "num_tokens": 100787943.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3663,
      "step_time": 17.094124987721443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 280.0,
      "completions/max_terminated_length": 280.0,
      "completions/mean_length": 256.125,
      "completions/mean_terminated_length": 256.125,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "entropy": 0.27463462203741074,
      "epoch": 0.1697081982399259,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10698054730892181,
      "kl": 0.0058757910737767816,
      "learning_rate": 9.660676238999536e-07,
      "loss": -0.0013,
      "num_tokens": 100815641.0,
      "reward": 0.9841635823249817,
      "reward_std": 0.04327329620718956,
      "rewards/reward_func/mean": 0.9841635823249817,
      "rewards/reward_func/std": 0.04327329248189926,
      "step": 3664,
      "step_time": 24.39543791860342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 137.0,
      "completions/mean_terminated_length": 137.0,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.3345174938440323,
      "epoch": 0.16975451597962019,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004585576709359884,
      "kl": 0.002789916645269841,
      "learning_rate": 9.66058360352015e-07,
      "loss": 0.0001,
      "num_tokens": 100838185.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3665,
      "step_time": 15.527457643300295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 127.625,
      "completions/mean_terminated_length": 127.625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2437593974173069,
      "epoch": 0.1698008337193145,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007722876965999603,
      "kl": 0.0038712118403054774,
      "learning_rate": 9.660490968040758e-07,
      "loss": 0.0002,
      "num_tokens": 100857763.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3666,
      "step_time": 13.929988894611597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 211.75,
      "completions/mean_terminated_length": 211.75,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.28165485709905624,
      "epoch": 0.1698471514590088,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10015315562486649,
      "kl": 0.02622091630473733,
      "learning_rate": 9.66039833256137e-07,
      "loss": -0.0143,
      "num_tokens": 100880687.0,
      "reward": 0.8524489998817444,
      "reward_std": 0.19673466682434082,
      "rewards/reward_func/mean": 0.8524489998817444,
      "rewards/reward_func/std": 0.19673468172550201,
      "step": 3667,
      "step_time": 20.948937579989433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 131.625,
      "completions/mean_terminated_length": 131.625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.28634975105524063,
      "epoch": 0.1698934691987031,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003771155374124646,
      "kl": 0.002001322660362348,
      "learning_rate": 9.66030569708198e-07,
      "loss": 0.0001,
      "num_tokens": 100906169.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3668,
      "step_time": 15.294891849160194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 144.0625,
      "completions/mean_terminated_length": 144.0625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3732023164629936,
      "epoch": 0.1699397869383974,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026728189550340176,
      "kl": 0.002181119954911992,
      "learning_rate": 9.660213061602594e-07,
      "loss": 0.0001,
      "num_tokens": 100929850.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3669,
      "step_time": 17.441290482878685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 142.1875,
      "completions/mean_terminated_length": 142.1875,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.2838490232825279,
      "epoch": 0.16998610467809172,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025254993233829737,
      "kl": 0.001794445444829762,
      "learning_rate": 9.660120426123205e-07,
      "loss": 0.0001,
      "num_tokens": 100951373.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3670,
      "step_time": 15.839925896376371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 120.875,
      "completions/mean_terminated_length": 120.875,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2890567481517792,
      "epoch": 0.17003242241778602,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031862810719758272,
      "kl": 0.002349153161048889,
      "learning_rate": 9.660027790643817e-07,
      "loss": 0.0001,
      "num_tokens": 100972187.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3671,
      "step_time": 13.070206925272942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 170.5625,
      "completions/mean_terminated_length": 170.5625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.33420466631650925,
      "epoch": 0.1700787401574803,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.025074990466237068,
      "kl": 0.015349670546129346,
      "learning_rate": 9.659935155164428e-07,
      "loss": 0.0007,
      "num_tokens": 100993668.0,
      "reward": 1.8562681702860573e-07,
      "reward_std": 7.425072681144229e-07,
      "rewards/reward_func/mean": 1.8562681702860573e-07,
      "rewards/reward_func/std": 7.425073249578418e-07,
      "step": 3672,
      "step_time": 17.41484932228923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 161.1875,
      "completions/mean_terminated_length": 161.1875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.26415394246578217,
      "epoch": 0.1701250578971746,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19075551629066467,
      "kl": 0.01063548936508596,
      "learning_rate": 9.65984251968504e-07,
      "loss": 0.003,
      "num_tokens": 101015687.0,
      "reward": 0.8836101293563843,
      "reward_std": 0.04517616704106331,
      "rewards/reward_func/mean": 0.8836101293563843,
      "rewards/reward_func/std": 0.045176174491643906,
      "step": 3673,
      "step_time": 17.412964086979628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 126.3125,
      "completions/mean_terminated_length": 126.3125,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.25147486478090286,
      "epoch": 0.17017137563686893,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002212977735325694,
      "kl": 0.0018555527785792947,
      "learning_rate": 9.65974988420565e-07,
      "loss": 0.0001,
      "num_tokens": 101037036.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3674,
      "step_time": 13.811803121119738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 206.3125,
      "completions/mean_terminated_length": 206.3125,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.25448210909962654,
      "epoch": 0.17021769337656323,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0040047443471848965,
      "kl": 0.02641190541908145,
      "learning_rate": 9.659657248726262e-07,
      "loss": 0.0013,
      "num_tokens": 101071713.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3675,
      "step_time": 23.2487272284925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 132.5625,
      "completions/mean_terminated_length": 132.5625,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.306539848446846,
      "epoch": 0.17026401111625752,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002413178561255336,
      "kl": 0.0018781186663545668,
      "learning_rate": 9.659564613246873e-07,
      "loss": 0.0001,
      "num_tokens": 101107530.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3676,
      "step_time": 17.817917369306087
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 173.9375,
      "completions/mean_terminated_length": 173.9375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.1745246984064579,
      "epoch": 0.17031032885595182,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027705396059900522,
      "kl": 0.0018873816588893533,
      "learning_rate": 9.659471977767484e-07,
      "loss": 0.0001,
      "num_tokens": 101144569.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3677,
      "step_time": 21.63655637949705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 151.8125,
      "completions/mean_terminated_length": 151.8125,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.18667694926261902,
      "epoch": 0.17035664659564614,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23493200540542603,
      "kl": 0.004584392299875617,
      "learning_rate": 9.659379342288095e-07,
      "loss": -0.05,
      "num_tokens": 101168758.0,
      "reward": 0.017583835870027542,
      "reward_std": 0.030946951359510422,
      "rewards/reward_func/mean": 0.017583835870027542,
      "rewards/reward_func/std": 0.03094695322215557,
      "step": 3678,
      "step_time": 16.856909211724997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 142.25,
      "completions/mean_terminated_length": 142.25,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.27224060148000717,
      "epoch": 0.17040296433534044,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005826570559293032,
      "kl": 0.0031756037496961653,
      "learning_rate": 9.659286706808707e-07,
      "loss": 0.0002,
      "num_tokens": 101189050.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3679,
      "step_time": 16.656532626599073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 134.625,
      "completions/mean_terminated_length": 134.625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.23219594731926918,
      "epoch": 0.17044928207503474,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004386923275887966,
      "kl": 0.002785380696877837,
      "learning_rate": 9.659194071329318e-07,
      "loss": 0.0001,
      "num_tokens": 101209812.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3680,
      "step_time": 14.081293478608131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 163.25,
      "completions/mean_terminated_length": 163.25,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.37270666658878326,
      "epoch": 0.17049559981472903,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0055905235931277275,
      "kl": 0.004493484157137573,
      "learning_rate": 9.65910143584993e-07,
      "loss": 0.0002,
      "num_tokens": 101260568.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3681,
      "step_time": 24.699590887874365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 147.0625,
      "completions/mean_terminated_length": 147.0625,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.2637674883008003,
      "epoch": 0.17054191755442336,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007293649483472109,
      "kl": 0.004482871852815151,
      "learning_rate": 9.659008800370543e-07,
      "loss": 0.0002,
      "num_tokens": 101280649.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3682,
      "step_time": 16.569469437003136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 128.375,
      "completions/mean_terminated_length": 128.375,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.25608301162719727,
      "epoch": 0.17058823529411765,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035667158663272858,
      "kl": 0.0022067642712499946,
      "learning_rate": 9.658916164891154e-07,
      "loss": 0.0001,
      "num_tokens": 101302383.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3683,
      "step_time": 13.727999657392502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 155.0,
      "completions/mean_terminated_length": 155.0,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.44421854615211487,
      "epoch": 0.17063455303381195,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038980096578598022,
      "kl": 0.0033952242229133844,
      "learning_rate": 9.658823529411765e-07,
      "loss": 0.0002,
      "num_tokens": 101360431.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3684,
      "step_time": 25.657791543751955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 176.1875,
      "completions/mean_terminated_length": 176.1875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.4523504003882408,
      "epoch": 0.17068087077350624,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030881785787642,
      "kl": 0.0030222403001971543,
      "learning_rate": 9.658730893932376e-07,
      "loss": 0.0002,
      "num_tokens": 101382098.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3685,
      "step_time": 20.41754274070263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 154.75,
      "completions/mean_terminated_length": 154.75,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.33699050545692444,
      "epoch": 0.17072718851320057,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036779611837118864,
      "kl": 0.0024129419471137226,
      "learning_rate": 9.658638258452988e-07,
      "loss": 0.0001,
      "num_tokens": 101403582.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3686,
      "step_time": 16.62067049369216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 145.75,
      "completions/mean_terminated_length": 145.75,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.2957451716065407,
      "epoch": 0.17077350625289486,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024708774872124195,
      "kl": 0.0018769590242300183,
      "learning_rate": 9.658545622973599e-07,
      "loss": 0.0001,
      "num_tokens": 101425530.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3687,
      "step_time": 15.382260516285896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 178.75,
      "completions/mean_terminated_length": 178.75,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.1754971146583557,
      "epoch": 0.17081982399258916,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021309982985258102,
      "kl": 0.0015650664572604,
      "learning_rate": 9.65845298749421e-07,
      "loss": 0.0001,
      "num_tokens": 101449542.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 3688,
      "step_time": 18.965327501296997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 146.5,
      "completions/mean_terminated_length": 146.5,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.18681560829281807,
      "epoch": 0.17086614173228346,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19526633620262146,
      "kl": 0.004680161306168884,
      "learning_rate": 9.658360352014821e-07,
      "loss": -0.0346,
      "num_tokens": 101470830.0,
      "reward": 0.6141525506973267,
      "reward_std": 0.2965758144855499,
      "rewards/reward_func/mean": 0.6141525506973267,
      "rewards/reward_func/std": 0.2965758144855499,
      "step": 3689,
      "step_time": 15.90220457687974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 135.125,
      "completions/mean_terminated_length": 135.125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.27682189643383026,
      "epoch": 0.17091245947197778,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002285691909492016,
      "kl": 0.0017480867099948227,
      "learning_rate": 9.658267716535433e-07,
      "loss": 0.0001,
      "num_tokens": 101493616.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3690,
      "step_time": 14.771844647824764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 177.75,
      "completions/mean_terminated_length": 177.75,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.2852865308523178,
      "epoch": 0.17095877721167207,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11144199222326279,
      "kl": 0.020489394664764404,
      "learning_rate": 9.658175081056044e-07,
      "loss": -0.0358,
      "num_tokens": 101515948.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 3691,
      "step_time": 18.764317836612463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 159.1875,
      "completions/mean_terminated_length": 159.1875,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.15364742279052734,
      "epoch": 0.17100509495136637,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1287347376346588,
      "kl": 0.003039184375666082,
      "learning_rate": 9.658082445576655e-07,
      "loss": -0.0072,
      "num_tokens": 101539407.0,
      "reward": 0.3012182116508484,
      "reward_std": 0.01231658086180687,
      "rewards/reward_func/mean": 0.3012182116508484,
      "rewards/reward_func/std": 0.01231657899916172,
      "step": 3692,
      "step_time": 16.599547754973173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 150.25,
      "completions/mean_terminated_length": 150.25,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.3196949064731598,
      "epoch": 0.17105141269106067,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010743924416601658,
      "kl": 0.004210477869492024,
      "learning_rate": 9.657989810097266e-07,
      "loss": 0.0002,
      "num_tokens": 101566963.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3693,
      "step_time": 17.0893935225904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 163.125,
      "completions/mean_terminated_length": 163.125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3706919550895691,
      "epoch": 0.171097730430755,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.016289448365569115,
      "kl": 0.005931872874498367,
      "learning_rate": 9.657897174617878e-07,
      "loss": 0.0003,
      "num_tokens": 101612005.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3694,
      "step_time": 21.769167751073837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 258.0,
      "completions/mean_terminated_length": 258.0,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "entropy": 0.20919641852378845,
      "epoch": 0.1711440481704493,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006265141069889069,
      "kl": 0.008982607396319509,
      "learning_rate": 9.65780453913849e-07,
      "loss": 0.0005,
      "num_tokens": 101637365.0,
      "reward": 0.8914382457733154,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8914382457733154,
      "rewards/reward_func/std": 0.0,
      "step": 3695,
      "step_time": 25.146470360457897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 199.875,
      "completions/mean_terminated_length": 199.875,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.24084605649113655,
      "epoch": 0.17119036591014358,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01442259643226862,
      "kl": 0.013131072046235204,
      "learning_rate": 9.657711903659102e-07,
      "loss": 0.0007,
      "num_tokens": 101660931.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3696,
      "step_time": 21.33481117337942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 198.0625,
      "completions/mean_terminated_length": 198.0625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.2683481052517891,
      "epoch": 0.17123668364983788,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012098944745957851,
      "kl": 0.011228818213567138,
      "learning_rate": 9.657619268179711e-07,
      "loss": 0.0006,
      "num_tokens": 101690884.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3697,
      "step_time": 21.216736134141684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 151.375,
      "completions/mean_terminated_length": 151.375,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.32109837979078293,
      "epoch": 0.1712830013895322,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007906484417617321,
      "kl": 0.005550313624553382,
      "learning_rate": 9.657526632700323e-07,
      "loss": 0.0003,
      "num_tokens": 101712458.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3698,
      "step_time": 15.93731328472495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 178.6875,
      "completions/mean_terminated_length": 178.6875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.3686307594180107,
      "epoch": 0.1713293191292265,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00804579071700573,
      "kl": 0.008447291096672416,
      "learning_rate": 9.657433997220936e-07,
      "loss": 0.0004,
      "num_tokens": 101733589.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3699,
      "step_time": 20.16174814477563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 167.0625,
      "completions/mean_terminated_length": 167.0625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.1832829974591732,
      "epoch": 0.1713756368689208,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025956721510738134,
      "kl": 0.0017483013507444412,
      "learning_rate": 9.657341361741547e-07,
      "loss": 0.0001,
      "num_tokens": 101756982.0,
      "reward": 0.9487294554710388,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9487294554710388,
      "rewards/reward_func/std": 0.0,
      "step": 3700,
      "step_time": 18.25501473993063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 137.25,
      "completions/mean_terminated_length": 137.25,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2978828251361847,
      "epoch": 0.1714219546086151,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004572514444589615,
      "kl": 0.001750871044350788,
      "learning_rate": 9.657248726262158e-07,
      "loss": 0.0001,
      "num_tokens": 101782522.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3701,
      "step_time": 16.17101848870516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 255.0,
      "completions/max_terminated_length": 255.0,
      "completions/mean_length": 225.0,
      "completions/mean_terminated_length": 225.0,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.18214595690369606,
      "epoch": 0.17146827234830941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13575305044651031,
      "kl": 0.011036734678782523,
      "learning_rate": 9.65715609078277e-07,
      "loss": -0.0263,
      "num_tokens": 101820970.0,
      "reward": 0.835374116897583,
      "reward_std": 0.1148674339056015,
      "rewards/reward_func/mean": 0.835374116897583,
      "rewards/reward_func/std": 0.1148674339056015,
      "step": 3702,
      "step_time": 25.153350312262774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 121.6875,
      "completions/mean_terminated_length": 121.6875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.21703851595520973,
      "epoch": 0.1715145900880037,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034549045376479626,
      "kl": 0.002502273127902299,
      "learning_rate": 9.65706345530338e-07,
      "loss": 0.0001,
      "num_tokens": 101840629.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3703,
      "step_time": 13.370243936777115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 159.3125,
      "completions/mean_terminated_length": 159.3125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.17573610693216324,
      "epoch": 0.171560907827698,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019829643424600363,
      "kl": 0.0013354003604035825,
      "learning_rate": 9.656970819823992e-07,
      "loss": 0.0001,
      "num_tokens": 101870906.0,
      "reward": 0.8751733303070068,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8751733303070068,
      "rewards/reward_func/std": 0.0,
      "step": 3704,
      "step_time": 19.326514348387718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 133.625,
      "completions/mean_terminated_length": 133.625,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.31791510432958603,
      "epoch": 0.1716072255673923,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002956633921712637,
      "kl": 0.0022645049612037838,
      "learning_rate": 9.656878184344603e-07,
      "loss": 0.0001,
      "num_tokens": 101906916.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3705,
      "step_time": 18.75690321996808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 289.0,
      "completions/max_terminated_length": 289.0,
      "completions/mean_length": 204.9375,
      "completions/mean_terminated_length": 204.9375,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.3802553340792656,
      "epoch": 0.17165354330708663,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010592067614197731,
      "kl": 0.009749772492796183,
      "learning_rate": 9.656785548865215e-07,
      "loss": 0.0005,
      "num_tokens": 101931827.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3706,
      "step_time": 24.345446296036243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 185.875,
      "completions/mean_terminated_length": 185.875,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.17301489785313606,
      "epoch": 0.17169986104678092,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006229810882359743,
      "kl": 0.0051842586835846305,
      "learning_rate": 9.656692913385826e-07,
      "loss": 0.0003,
      "num_tokens": 101964049.0,
      "reward": 0.9487294554710388,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9487294554710388,
      "rewards/reward_func/std": 0.0,
      "step": 3707,
      "step_time": 21.264019422233105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 156.8125,
      "completions/mean_terminated_length": 156.8125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.39049182087183,
      "epoch": 0.17174617878647522,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004415351431816816,
      "kl": 0.002992833615280688,
      "learning_rate": 9.656600277906437e-07,
      "loss": 0.0001,
      "num_tokens": 101996990.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3708,
      "step_time": 19.036223154515028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 304.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 243.0625,
      "completions/mean_terminated_length": 243.0625,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.28115164488554,
      "epoch": 0.1717924965261695,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08259239792823792,
      "kl": 0.015964378137141466,
      "learning_rate": 9.656507642427048e-07,
      "loss": -0.0682,
      "num_tokens": 102021487.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 3709,
      "step_time": 25.482900887727737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 188.75,
      "completions/mean_terminated_length": 188.75,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.21375613659620285,
      "epoch": 0.17183881426586384,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005931749939918518,
      "kl": 0.0047469911514781415,
      "learning_rate": 9.65641500694766e-07,
      "loss": 0.0002,
      "num_tokens": 102046059.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3710,
      "step_time": 19.37825195491314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 139.3125,
      "completions/mean_terminated_length": 139.3125,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.2109673209488392,
      "epoch": 0.17188513200555813,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002921863691881299,
      "kl": 0.0015583883505314589,
      "learning_rate": 9.65632237146827e-07,
      "loss": 0.0001,
      "num_tokens": 102066224.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3711,
      "step_time": 15.059500459581614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 163.125,
      "completions/mean_terminated_length": 163.125,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.25227079540491104,
      "epoch": 0.17193144974525243,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004785130266100168,
      "kl": 0.011102706892415881,
      "learning_rate": 9.656229735988884e-07,
      "loss": 0.0006,
      "num_tokens": 102099698.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 3712,
      "step_time": 20.44314457848668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 202.8125,
      "completions/mean_terminated_length": 202.8125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.25471294671297073,
      "epoch": 0.17197776748494673,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004278079140931368,
      "kl": 0.003029489715117961,
      "learning_rate": 9.656137100509495e-07,
      "loss": 0.0002,
      "num_tokens": 102128703.0,
      "reward": 0.5044883489608765,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5044883489608765,
      "rewards/reward_func/std": 0.0,
      "step": 3713,
      "step_time": 20.81208061054349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 166.3125,
      "completions/mean_terminated_length": 166.3125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.13849686458706856,
      "epoch": 0.17202408522464105,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012530670501291752,
      "kl": 0.0008675205754116178,
      "learning_rate": 9.656044465030107e-07,
      "loss": 0.0,
      "num_tokens": 102166548.0,
      "reward": 0.8385766744613647,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8385766744613647,
      "rewards/reward_func/std": 0.0,
      "step": 3714,
      "step_time": 19.745044253766537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 133.5625,
      "completions/mean_terminated_length": 133.5625,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.3137066662311554,
      "epoch": 0.17207040296433534,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008268541656434536,
      "kl": 0.0035453151213005185,
      "learning_rate": 9.655951829550718e-07,
      "loss": 0.0002,
      "num_tokens": 102187341.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3715,
      "step_time": 14.896617818623781
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 122.125,
      "completions/mean_terminated_length": 122.125,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.3074679300189018,
      "epoch": 0.17211672070402964,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002979022217914462,
      "kl": 0.0019353233510628343,
      "learning_rate": 9.65585919407133e-07,
      "loss": 0.0001,
      "num_tokens": 102211311.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3716,
      "step_time": 14.561978124082088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 156.375,
      "completions/mean_terminated_length": 156.375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.30533620715141296,
      "epoch": 0.17216303844372394,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014913151040673256,
      "kl": 0.011204065987840295,
      "learning_rate": 9.65576655859194e-07,
      "loss": 0.0006,
      "num_tokens": 102232309.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3717,
      "step_time": 16.69642524048686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 121.375,
      "completions/mean_terminated_length": 121.375,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.2996649816632271,
      "epoch": 0.17220935618341826,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034739826805889606,
      "kl": 0.0024550510570406914,
      "learning_rate": 9.655673923112552e-07,
      "loss": 0.0001,
      "num_tokens": 102255083.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3718,
      "step_time": 14.627103310078382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 136.125,
      "completions/mean_terminated_length": 136.125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.24994129687547684,
      "epoch": 0.17225567392311256,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003131583333015442,
      "kl": 0.001976861385628581,
      "learning_rate": 9.655581287633163e-07,
      "loss": 0.0001,
      "num_tokens": 102275805.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3719,
      "step_time": 14.621182892471552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 192.9375,
      "completions/mean_terminated_length": 192.9375,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.2310882769525051,
      "epoch": 0.17230199166280685,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030810649041086435,
      "kl": 0.0019817696302197874,
      "learning_rate": 9.655488652153774e-07,
      "loss": 0.0001,
      "num_tokens": 102330524.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3720,
      "step_time": 27.614696621894836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 162.75,
      "completions/mean_terminated_length": 162.75,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.39921774715185165,
      "epoch": 0.17234830940250115,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016555432230234146,
      "kl": 0.002115784795023501,
      "learning_rate": 9.655396016674385e-07,
      "loss": 0.0001,
      "num_tokens": 102389976.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3721,
      "step_time": 27.416408576071262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 156.8125,
      "completions/mean_terminated_length": 156.8125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.19609655812382698,
      "epoch": 0.17239462714219547,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019529511919245124,
      "kl": 0.00153403909644112,
      "learning_rate": 9.655303381194997e-07,
      "loss": 0.0001,
      "num_tokens": 102410581.0,
      "reward": 0.7026185393333435,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7026185393333435,
      "rewards/reward_func/std": 0.0,
      "step": 3722,
      "step_time": 16.58870291337371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 158.75,
      "completions/mean_terminated_length": 158.75,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.3258315548300743,
      "epoch": 0.17244094488188977,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006843021605163813,
      "kl": 0.004411225905641913,
      "learning_rate": 9.655210745715608e-07,
      "loss": 0.0002,
      "num_tokens": 102436433.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3723,
      "step_time": 17.99579330161214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 174.5,
      "completions/mean_terminated_length": 174.5,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.38945455104112625,
      "epoch": 0.17248726262158406,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007077403832226992,
      "kl": 0.005766460788436234,
      "learning_rate": 9.65511811023622e-07,
      "loss": 0.0003,
      "num_tokens": 102463545.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3724,
      "step_time": 19.768442127853632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 155.25,
      "completions/mean_terminated_length": 155.25,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.4106160178780556,
      "epoch": 0.17253358036127836,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020999412517994642,
      "kl": 0.0023164761951193213,
      "learning_rate": 9.655025474756833e-07,
      "loss": 0.0001,
      "num_tokens": 102498701.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3725,
      "step_time": 19.501278955489397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 191.75,
      "completions/mean_terminated_length": 191.75,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.38444218039512634,
      "epoch": 0.17257989810097268,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005422711372375488,
      "kl": 0.003909895662218332,
      "learning_rate": 9.654932839277444e-07,
      "loss": 0.0002,
      "num_tokens": 102542169.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3726,
      "step_time": 24.480846971273422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 213.75,
      "completions/mean_terminated_length": 213.75,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.17081598192453384,
      "epoch": 0.17262621584066698,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026369125116616488,
      "kl": 0.002134683105396107,
      "learning_rate": 9.654840203798055e-07,
      "loss": 0.0001,
      "num_tokens": 102581845.0,
      "reward": 0.8507331609725952,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8507331609725952,
      "rewards/reward_func/std": 0.0,
      "step": 3727,
      "step_time": 23.693589121103287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 137.0625,
      "completions/mean_terminated_length": 137.0625,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.28495217114686966,
      "epoch": 0.17267253358036128,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00954211875796318,
      "kl": 0.005100173759274185,
      "learning_rate": 9.654747568318664e-07,
      "loss": 0.0003,
      "num_tokens": 102603782.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3728,
      "step_time": 15.57606941089034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 148.375,
      "completions/mean_terminated_length": 148.375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.2724595218896866,
      "epoch": 0.17271885132005557,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004697760101407766,
      "kl": 0.0028598178178071976,
      "learning_rate": 9.654654932839278e-07,
      "loss": 0.0001,
      "num_tokens": 102625148.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3729,
      "step_time": 15.565168585628271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 190.125,
      "completions/mean_terminated_length": 190.125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.20056196674704552,
      "epoch": 0.1727651690597499,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004472844768315554,
      "kl": 0.01144473161548376,
      "learning_rate": 9.654562297359889e-07,
      "loss": 0.0006,
      "num_tokens": 102654686.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3730,
      "step_time": 20.77423469349742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 128.1875,
      "completions/mean_terminated_length": 128.1875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.248933307826519,
      "epoch": 0.1728114867994442,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004179767332971096,
      "kl": 0.0024145441129803658,
      "learning_rate": 9.6544696618805e-07,
      "loss": 0.0001,
      "num_tokens": 102675553.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3731,
      "step_time": 13.76466766744852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 193.125,
      "completions/mean_terminated_length": 193.125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.21037736907601357,
      "epoch": 0.1728578045391385,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003996069543063641,
      "kl": 0.00316495617153123,
      "learning_rate": 9.654377026401111e-07,
      "loss": 0.0002,
      "num_tokens": 102703443.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3732,
      "step_time": 19.998630672693253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 160.3125,
      "completions/mean_terminated_length": 160.3125,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.2855217680335045,
      "epoch": 0.17290412227883278,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16108264029026031,
      "kl": 0.022220322862267494,
      "learning_rate": 9.654284390921723e-07,
      "loss": -0.0756,
      "num_tokens": 102724536.0,
      "reward": 0.8462153673171997,
      "reward_std": 0.3358876705169678,
      "rewards/reward_func/mean": 0.8462153673171997,
      "rewards/reward_func/std": 0.3358876705169678,
      "step": 3733,
      "step_time": 18.034131448715925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 257.0,
      "completions/max_terminated_length": 257.0,
      "completions/mean_length": 200.875,
      "completions/mean_terminated_length": 200.875,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.36467454582452774,
      "epoch": 0.1729504400185271,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007738959975540638,
      "kl": 0.007918860763311386,
      "learning_rate": 9.654191755442334e-07,
      "loss": 0.0004,
      "num_tokens": 102753670.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3734,
      "step_time": 23.042970154434443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 158.3125,
      "completions/mean_terminated_length": 158.3125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3759901747107506,
      "epoch": 0.1729967577582214,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005353032145649195,
      "kl": 0.004246300901286304,
      "learning_rate": 9.654099119962945e-07,
      "loss": 0.0002,
      "num_tokens": 102777931.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3735,
      "step_time": 17.61615765839815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 142.25,
      "completions/mean_terminated_length": 142.25,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.207004364579916,
      "epoch": 0.1730430754979157,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030513142701238394,
      "kl": 0.0021838270185980946,
      "learning_rate": 9.654006484483556e-07,
      "loss": 0.0001,
      "num_tokens": 102803551.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 3736,
      "step_time": 17.92522521317005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 388.0,
      "completions/max_terminated_length": 388.0,
      "completions/mean_length": 302.625,
      "completions/mean_terminated_length": 302.625,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "entropy": 0.2569631487131119,
      "epoch": 0.17308939323761,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07253891229629517,
      "kl": 0.009775074431672692,
      "learning_rate": 9.653913849004168e-07,
      "loss": -0.1115,
      "num_tokens": 102833001.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 3737,
      "step_time": 32.41397521272302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 144.375,
      "completions/mean_terminated_length": 144.375,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.28750574588775635,
      "epoch": 0.17313571097730432,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025953215081244707,
      "kl": 0.0023064085689838976,
      "learning_rate": 9.653821213524779e-07,
      "loss": 0.0001,
      "num_tokens": 102856671.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3738,
      "step_time": 16.143691390752792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 139.5625,
      "completions/mean_terminated_length": 139.5625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.3195468336343765,
      "epoch": 0.17318202871699862,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003679557703435421,
      "kl": 0.0026374868175480515,
      "learning_rate": 9.653728578045392e-07,
      "loss": 0.0001,
      "num_tokens": 102877288.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3739,
      "step_time": 15.23041708394885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 117.3125,
      "completions/mean_terminated_length": 117.3125,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.2956790328025818,
      "epoch": 0.1732283464566929,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032189737539738417,
      "kl": 0.0019083183142356575,
      "learning_rate": 9.653635942566001e-07,
      "loss": 0.0001,
      "num_tokens": 102898493.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3740,
      "step_time": 13.946897964924574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 179.5,
      "completions/mean_terminated_length": 179.5,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.41823677718639374,
      "epoch": 0.1732746641963872,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003907694946974516,
      "kl": 0.0029561029514297843,
      "learning_rate": 9.653543307086613e-07,
      "loss": 0.0001,
      "num_tokens": 102940277.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3741,
      "step_time": 23.860097888857126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 118.9375,
      "completions/mean_terminated_length": 118.9375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3027636259794235,
      "epoch": 0.17332098193608153,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001629327773116529,
      "kl": 0.0017339542391709983,
      "learning_rate": 9.653450671607226e-07,
      "loss": 0.0001,
      "num_tokens": 102961412.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3742,
      "step_time": 13.017041232436895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 198.1875,
      "completions/mean_terminated_length": 198.1875,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.20047512650489807,
      "epoch": 0.17336729967577583,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001798186800442636,
      "kl": 0.0012854207889176905,
      "learning_rate": 9.653358036127837e-07,
      "loss": 0.0001,
      "num_tokens": 102996583.0,
      "reward": 0.11362193524837494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11362193524837494,
      "rewards/reward_func/std": 0.0,
      "step": 3743,
      "step_time": 22.757054530084133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 135.5625,
      "completions/mean_terminated_length": 135.5625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.3016543537378311,
      "epoch": 0.17341361741547012,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034233916085213423,
      "kl": 0.002116954739904031,
      "learning_rate": 9.653265400648448e-07,
      "loss": 0.0001,
      "num_tokens": 103018176.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3744,
      "step_time": 15.639807790517807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 405.0,
      "completions/max_terminated_length": 405.0,
      "completions/mean_length": 259.9375,
      "completions/mean_terminated_length": 259.9375,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.4923515170812607,
      "epoch": 0.17345993515516442,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07116729021072388,
      "kl": 0.010051576886326075,
      "learning_rate": 9.65317276516906e-07,
      "loss": -0.0145,
      "num_tokens": 103051183.0,
      "reward": 0.11555609852075577,
      "reward_std": 0.31404364109039307,
      "rewards/reward_func/mean": 0.11555609852075577,
      "rewards/reward_func/std": 0.3140436112880707,
      "step": 3745,
      "step_time": 34.470788452774286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 115.125,
      "completions/mean_terminated_length": 115.125,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "entropy": 0.2561406195163727,
      "epoch": 0.17350625289485874,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004937897901982069,
      "kl": 0.003161235072184354,
      "learning_rate": 9.65308012968967e-07,
      "loss": 0.0002,
      "num_tokens": 103070497.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3746,
      "step_time": 14.427247531712055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 140.9375,
      "completions/mean_terminated_length": 140.9375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3614794984459877,
      "epoch": 0.17355257063455304,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006700413767248392,
      "kl": 0.00337793497601524,
      "learning_rate": 9.652987494210282e-07,
      "loss": 0.0002,
      "num_tokens": 103101888.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3747,
      "step_time": 18.192476235330105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 148.1875,
      "completions/mean_terminated_length": 148.1875,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.16572047024965286,
      "epoch": 0.17359888837424733,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010025094263255596,
      "kl": 0.0083791270153597,
      "learning_rate": 9.652894858730893e-07,
      "loss": 0.0004,
      "num_tokens": 103124147.0,
      "reward": 0.6778095960617065,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6778095960617065,
      "rewards/reward_func/std": 0.0,
      "step": 3748,
      "step_time": 16.14195951446891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 153.0,
      "completions/mean_terminated_length": 153.0,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.19808354601264,
      "epoch": 0.17364520611394163,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0061765736900269985,
      "kl": 0.004112478287424892,
      "learning_rate": 9.652802223251505e-07,
      "loss": 0.0002,
      "num_tokens": 103144947.0,
      "reward": 0.780767560005188,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.780767560005188,
      "rewards/reward_func/std": 0.0,
      "step": 3749,
      "step_time": 15.514467250555754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 269.0,
      "completions/max_terminated_length": 269.0,
      "completions/mean_length": 234.5625,
      "completions/mean_terminated_length": 234.5625,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "entropy": 0.20365862175822258,
      "epoch": 0.17369152385363595,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005523050203919411,
      "kl": 0.005299078766256571,
      "learning_rate": 9.652709587772116e-07,
      "loss": 0.0003,
      "num_tokens": 103168860.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3750,
      "step_time": 22.849873408675194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 158.3125,
      "completions/mean_terminated_length": 158.3125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.23281722888350487,
      "epoch": 0.17373784159333025,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035154789220541716,
      "kl": 0.0028017257573083043,
      "learning_rate": 9.652616952292727e-07,
      "loss": 0.0001,
      "num_tokens": 103200529.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 3751,
      "step_time": 18.584865167737007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 180.4375,
      "completions/mean_terminated_length": 180.4375,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.31790756434202194,
      "epoch": 0.17378415933302455,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09971493482589722,
      "kl": 0.0063216263661161065,
      "learning_rate": 9.652524316813338e-07,
      "loss": 0.0316,
      "num_tokens": 103222216.0,
      "reward": 0.8482850790023804,
      "reward_std": 0.2262093424797058,
      "rewards/reward_func/mean": 0.8482850790023804,
      "rewards/reward_func/std": 0.226209357380867,
      "step": 3752,
      "step_time": 20.639958526939154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 147.9375,
      "completions/mean_terminated_length": 147.9375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.29077449440956116,
      "epoch": 0.17383047707271884,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00254775770008564,
      "kl": 0.002161647193133831,
      "learning_rate": 9.65243168133395e-07,
      "loss": 0.0001,
      "num_tokens": 103245623.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3753,
      "step_time": 15.709707036614418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 167.875,
      "completions/mean_terminated_length": 167.875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.21284860745072365,
      "epoch": 0.17387679481241317,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007645934820175171,
      "kl": 0.024556422606110573,
      "learning_rate": 9.65233904585456e-07,
      "loss": 0.0012,
      "num_tokens": 103266277.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 3754,
      "step_time": 19.057133305817842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 143.875,
      "completions/mean_terminated_length": 143.875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.2866797000169754,
      "epoch": 0.17392311255210746,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035950294695794582,
      "kl": 0.002579561376478523,
      "learning_rate": 9.652246410375174e-07,
      "loss": 0.0001,
      "num_tokens": 103288259.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3755,
      "step_time": 15.143626194447279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3602429926395416,
      "epoch": 0.17396943029180176,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00384682253934443,
      "kl": 0.0033572075772099197,
      "learning_rate": 9.652153774895786e-07,
      "loss": 0.0002,
      "num_tokens": 103334785.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3756,
      "step_time": 22.83459320664406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 167.4375,
      "completions/mean_terminated_length": 167.4375,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.28992898762226105,
      "epoch": 0.17401574803149605,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006469201762229204,
      "kl": 0.004892459022812545,
      "learning_rate": 9.652061139416397e-07,
      "loss": 0.0002,
      "num_tokens": 103359816.0,
      "reward": 0.694277822971344,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.694277822971344,
      "rewards/reward_func/std": 0.0,
      "step": 3757,
      "step_time": 18.62649766355753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 137.6875,
      "completions/mean_terminated_length": 137.6875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3078532963991165,
      "epoch": 0.17406206577119038,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0054520429112017155,
      "kl": 0.002901038737036288,
      "learning_rate": 9.651968503937006e-07,
      "loss": 0.0001,
      "num_tokens": 103383187.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3758,
      "step_time": 14.65495702624321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 129.6875,
      "completions/mean_terminated_length": 129.6875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2888607755303383,
      "epoch": 0.17410838351088467,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038695086259394884,
      "kl": 0.00270639342488721,
      "learning_rate": 9.65187586845762e-07,
      "loss": 0.0001,
      "num_tokens": 103403342.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3759,
      "step_time": 15.090375158935785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 134.875,
      "completions/mean_terminated_length": 134.875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3306100144982338,
      "epoch": 0.17415470125057897,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011807381175458431,
      "kl": 0.005443289061076939,
      "learning_rate": 9.65178323297823e-07,
      "loss": 0.0003,
      "num_tokens": 103431916.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3760,
      "step_time": 16.116522643715143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 145.5,
      "completions/mean_terminated_length": 145.5,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.21775106713175774,
      "epoch": 0.17420101899027327,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031909984536468983,
      "kl": 0.002064206579234451,
      "learning_rate": 9.651690597498842e-07,
      "loss": 0.0001,
      "num_tokens": 103451748.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3761,
      "step_time": 15.601998090744019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 171.4375,
      "completions/mean_terminated_length": 171.4375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.1548592373728752,
      "epoch": 0.1742473367299676,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09636900573968887,
      "kl": 0.0011171667283633724,
      "learning_rate": 9.651597962019453e-07,
      "loss": 0.0067,
      "num_tokens": 103480795.0,
      "reward": 0.939050555229187,
      "reward_std": 0.0152902128174901,
      "rewards/reward_func/mean": 0.939050555229187,
      "rewards/reward_func/std": 0.015290215611457825,
      "step": 3762,
      "step_time": 18.289710648357868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 307.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 276.9375,
      "completions/mean_terminated_length": 276.9375,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "entropy": 0.28093042969703674,
      "epoch": 0.17429365446966189,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09751872718334198,
      "kl": 0.009608272463083267,
      "learning_rate": 9.651505326540064e-07,
      "loss": -0.0371,
      "num_tokens": 103520666.0,
      "reward": 0.857061505317688,
      "reward_std": 0.11662736535072327,
      "rewards/reward_func/mean": 0.857061505317688,
      "rewards/reward_func/std": 0.11662737280130386,
      "step": 3763,
      "step_time": 29.446891706436872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 138.6875,
      "completions/mean_terminated_length": 138.6875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.37999802827835083,
      "epoch": 0.17433997220935618,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009503517299890518,
      "kl": 0.005113681661896408,
      "learning_rate": 9.651412691060676e-07,
      "loss": 0.0003,
      "num_tokens": 103545717.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3764,
      "step_time": 16.105688523501158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 176.625,
      "completions/mean_terminated_length": 176.625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3866517096757889,
      "epoch": 0.17438628994905048,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031348082702606916,
      "kl": 0.0024833722854964435,
      "learning_rate": 9.651320055581287e-07,
      "loss": 0.0001,
      "num_tokens": 103577535.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3765,
      "step_time": 19.784521024674177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 199.75,
      "completions/mean_terminated_length": 199.75,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.46232395619153976,
      "epoch": 0.1744326076887448,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006246357224881649,
      "kl": 0.004766554455272853,
      "learning_rate": 9.651227420101898e-07,
      "loss": 0.0002,
      "num_tokens": 103611643.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3766,
      "step_time": 24.84987773373723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 168.5,
      "completions/mean_terminated_length": 168.5,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.34058190137147903,
      "epoch": 0.1744789254284391,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005178524646908045,
      "kl": 0.003166097041685134,
      "learning_rate": 9.65113478462251e-07,
      "loss": 0.0002,
      "num_tokens": 103645011.0,
      "reward": 0.0024787522852420807,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0024787522852420807,
      "rewards/reward_func/std": 0.0,
      "step": 3767,
      "step_time": 21.10859439522028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 148.0625,
      "completions/mean_terminated_length": 148.0625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.40794871747493744,
      "epoch": 0.1745252431681334,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030148965306580067,
      "kl": 0.002487560792360455,
      "learning_rate": 9.65104214914312e-07,
      "loss": 0.0001,
      "num_tokens": 103670820.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3768,
      "step_time": 16.841842528432608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 217.75,
      "completions/mean_terminated_length": 217.75,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.23108908906579018,
      "epoch": 0.1745715609078277,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09400482475757599,
      "kl": 0.00745700323022902,
      "learning_rate": 9.650949513663734e-07,
      "loss": 0.0083,
      "num_tokens": 103693104.0,
      "reward": 0.9030756950378418,
      "reward_std": 0.10626877844333649,
      "rewards/reward_func/mean": 0.9030756950378418,
      "rewards/reward_func/std": 0.10626878589391708,
      "step": 3769,
      "step_time": 20.494326047599316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 204.9375,
      "completions/mean_terminated_length": 204.9375,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.24559643864631653,
      "epoch": 0.174617878647522,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0932474210858345,
      "kl": 0.021621070336550474,
      "learning_rate": 9.650856878184345e-07,
      "loss": -0.0401,
      "num_tokens": 103718927.0,
      "reward": 0.6158918142318726,
      "reward_std": 0.25134479999542236,
      "rewards/reward_func/mean": 0.6158918142318726,
      "rewards/reward_func/std": 0.25134482979774475,
      "step": 3770,
      "step_time": 21.515521958470345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 154.3125,
      "completions/mean_terminated_length": 154.3125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.14618897810578346,
      "epoch": 0.1746641963872163,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16064424812793732,
      "kl": 0.004377821343950927,
      "learning_rate": 9.650764242704954e-07,
      "loss": -0.0244,
      "num_tokens": 103756196.0,
      "reward": 0.4293261468410492,
      "reward_std": 0.0618918351829052,
      "rewards/reward_func/mean": 0.4293261468410492,
      "rewards/reward_func/std": 0.061891838908195496,
      "step": 3771,
      "step_time": 20.11259887367487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 295.0,
      "completions/max_terminated_length": 295.0,
      "completions/mean_length": 172.9375,
      "completions/mean_terminated_length": 172.9375,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.23949642106890678,
      "epoch": 0.1747105141269106,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00627810787409544,
      "kl": 0.005583815509453416,
      "learning_rate": 9.650671607225568e-07,
      "loss": 0.0003,
      "num_tokens": 103783011.0,
      "reward": 0.9534969329833984,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9534969329833984,
      "rewards/reward_func/std": 0.0,
      "step": 3772,
      "step_time": 25.637911964207888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 124.5,
      "completions/mean_terminated_length": 124.5,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.30497778952121735,
      "epoch": 0.1747568318666049,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022148836869746447,
      "kl": 0.0017805375391617417,
      "learning_rate": 9.650578971746179e-07,
      "loss": 0.0001,
      "num_tokens": 103809563.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3773,
      "step_time": 16.14947897940874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 176.5,
      "completions/mean_terminated_length": 176.5,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.1669195331633091,
      "epoch": 0.17480314960629922,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09515602141618729,
      "kl": 0.002404117549303919,
      "learning_rate": 9.65048633626679e-07,
      "loss": -0.0007,
      "num_tokens": 103833859.0,
      "reward": 0.7707434892654419,
      "reward_std": 0.29473787546157837,
      "rewards/reward_func/mean": 0.7707434892654419,
      "rewards/reward_func/std": 0.29473790526390076,
      "step": 3774,
      "step_time": 19.48267900198698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 305.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 270.625,
      "completions/mean_terminated_length": 270.625,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "entropy": 0.18492351472377777,
      "epoch": 0.17484946734599352,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004487346392124891,
      "kl": 0.005292492685839534,
      "learning_rate": 9.650393700787401e-07,
      "loss": 0.0003,
      "num_tokens": 103860621.0,
      "reward": 0.8445175886154175,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8445175886154175,
      "rewards/reward_func/std": 0.0,
      "step": 3775,
      "step_time": 25.96353581547737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 330.0,
      "completions/max_terminated_length": 330.0,
      "completions/mean_length": 227.3125,
      "completions/mean_terminated_length": 227.3125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.3666505515575409,
      "epoch": 0.17489578508568782,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10239812731742859,
      "kl": 0.013023757841438055,
      "learning_rate": 9.650301065308013e-07,
      "loss": -0.1897,
      "num_tokens": 103899666.0,
      "reward": 0.2180238515138626,
      "reward_std": 0.3339870870113373,
      "rewards/reward_func/mean": 0.2180238515138626,
      "rewards/reward_func/std": 0.33398711681365967,
      "step": 3776,
      "step_time": 32.03803377598524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 174.1875,
      "completions/mean_terminated_length": 174.1875,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.21427936851978302,
      "epoch": 0.1749421028253821,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031666303984820843,
      "kl": 0.0021426088642328978,
      "learning_rate": 9.650208429828624e-07,
      "loss": 0.0001,
      "num_tokens": 103936917.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 3777,
      "step_time": 21.710630007088184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 186.5625,
      "completions/mean_terminated_length": 186.5625,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.42350853979587555,
      "epoch": 0.17498842056507644,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006017395295202732,
      "kl": 0.004326632770244032,
      "learning_rate": 9.650115794349235e-07,
      "loss": 0.0002,
      "num_tokens": 103971358.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3778,
      "step_time": 22.553493205457926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 180.75,
      "completions/mean_terminated_length": 180.75,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.2004511021077633,
      "epoch": 0.17503473830477073,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12609823048114777,
      "kl": 0.06032711360603571,
      "learning_rate": 9.650023158869846e-07,
      "loss": 0.024,
      "num_tokens": 103992970.0,
      "reward": 0.9580358266830444,
      "reward_std": 0.16785673797130585,
      "rewards/reward_func/mean": 0.9580358266830444,
      "rewards/reward_func/std": 0.16785675287246704,
      "step": 3779,
      "step_time": 19.642533019185066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 372.0,
      "completions/max_terminated_length": 372.0,
      "completions/mean_length": 245.0625,
      "completions/mean_terminated_length": 245.0625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.36056920886039734,
      "epoch": 0.17508105604446503,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1281254142522812,
      "kl": 0.014152311254292727,
      "learning_rate": 9.649930523390458e-07,
      "loss": -0.1922,
      "num_tokens": 104032363.0,
      "reward": 0.3268398940563202,
      "reward_std": 0.33202239871025085,
      "rewards/reward_func/mean": 0.3268398940563202,
      "rewards/reward_func/std": 0.33202236890792847,
      "step": 3780,
      "step_time": 33.77379969879985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 168.9375,
      "completions/mean_terminated_length": 168.9375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3546975255012512,
      "epoch": 0.17512737378415932,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00474060233682394,
      "kl": 0.003740232961717993,
      "learning_rate": 9.649837887911069e-07,
      "loss": 0.0002,
      "num_tokens": 104052906.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3781,
      "step_time": 16.384960014373064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 163.0,
      "completions/mean_terminated_length": 163.0,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.16587354615330696,
      "epoch": 0.17517369152385365,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2559323012828827,
      "kl": 0.02330538514070213,
      "learning_rate": 9.649745252431682e-07,
      "loss": 0.0037,
      "num_tokens": 104076634.0,
      "reward": 0.6426084041595459,
      "reward_std": 0.16432462632656097,
      "rewards/reward_func/mean": 0.6426084041595459,
      "rewards/reward_func/std": 0.16432464122772217,
      "step": 3782,
      "step_time": 16.389335557818413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 180.9375,
      "completions/mean_terminated_length": 180.9375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.44991335272789,
      "epoch": 0.17522000926354794,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007366468198597431,
      "kl": 0.005784983513876796,
      "learning_rate": 9.649652616952291e-07,
      "loss": 0.0003,
      "num_tokens": 104100793.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3783,
      "step_time": 21.555057518184185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 132.375,
      "completions/mean_terminated_length": 132.375,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2406536191701889,
      "epoch": 0.17526632700324224,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029314288403838873,
      "kl": 0.001833113085012883,
      "learning_rate": 9.649559981472903e-07,
      "loss": 0.0001,
      "num_tokens": 104120319.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3784,
      "step_time": 13.804068814963102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 134.6875,
      "completions/mean_terminated_length": 134.6875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3224761486053467,
      "epoch": 0.17531264474293654,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015656468458473682,
      "kl": 0.0014824274694547057,
      "learning_rate": 9.649467345993516e-07,
      "loss": 0.0001,
      "num_tokens": 104147322.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3785,
      "step_time": 15.190575946122408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 120.0625,
      "completions/mean_terminated_length": 120.0625,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.26283227279782295,
      "epoch": 0.17535896248263086,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004732994362711906,
      "kl": 0.0025724535225890577,
      "learning_rate": 9.649374710514127e-07,
      "loss": 0.0001,
      "num_tokens": 104166843.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3786,
      "step_time": 13.980366911739111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 195.4375,
      "completions/mean_terminated_length": 195.4375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.4102611541748047,
      "epoch": 0.17540528022232516,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037792969960719347,
      "kl": 0.0041391217964701355,
      "learning_rate": 9.649282075034738e-07,
      "loss": 0.0002,
      "num_tokens": 104213282.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3787,
      "step_time": 24.304872281849384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 372.0,
      "completions/max_terminated_length": 372.0,
      "completions/mean_length": 238.0,
      "completions/mean_terminated_length": 238.0,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.5076296031475067,
      "epoch": 0.17545159796201945,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10037500411272049,
      "kl": 0.008740164805203676,
      "learning_rate": 9.64918943955535e-07,
      "loss": 0.1476,
      "num_tokens": 104237106.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 3788,
      "step_time": 30.68525357171893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 178.625,
      "completions/mean_terminated_length": 178.625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.16099544242024422,
      "epoch": 0.17549791570171375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004643074236810207,
      "kl": 0.003465694549959153,
      "learning_rate": 9.64909680407596e-07,
      "loss": 0.0002,
      "num_tokens": 104260924.0,
      "reward": 0.9591894745826721,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9591894745826721,
      "rewards/reward_func/std": 0.0,
      "step": 3789,
      "step_time": 19.04364400729537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 110.4375,
      "completions/mean_terminated_length": 110.4375,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.30926015973091125,
      "epoch": 0.17554423344140807,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003996169660240412,
      "kl": 0.0023928280570544302,
      "learning_rate": 9.649004168596572e-07,
      "loss": 0.0001,
      "num_tokens": 104281715.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3790,
      "step_time": 12.806249611079693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 118.375,
      "completions/mean_terminated_length": 118.375,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.2586973272264004,
      "epoch": 0.17559055118110237,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002421583281829953,
      "kl": 0.0017168657795991749,
      "learning_rate": 9.648911533117183e-07,
      "loss": 0.0001,
      "num_tokens": 104305753.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3791,
      "step_time": 13.677399579435587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 140.4375,
      "completions/mean_terminated_length": 140.4375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.28452108800411224,
      "epoch": 0.17563686892079666,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004345900844782591,
      "kl": 0.001882482465589419,
      "learning_rate": 9.648818897637795e-07,
      "loss": 0.0001,
      "num_tokens": 104327584.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3792,
      "step_time": 15.165101058781147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 168.75,
      "completions/mean_terminated_length": 168.75,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.25236088037490845,
      "epoch": 0.17568318666049096,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12061569839715958,
      "kl": 0.022313192021101713,
      "learning_rate": 9.648726262158406e-07,
      "loss": -0.0775,
      "num_tokens": 104350268.0,
      "reward": 0.4090709090232849,
      "reward_std": 0.2306751012802124,
      "rewards/reward_func/mean": 0.4090709090232849,
      "rewards/reward_func/std": 0.2306751012802124,
      "step": 3793,
      "step_time": 21.085606180131435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 153.4375,
      "completions/mean_terminated_length": 153.4375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.4003930240869522,
      "epoch": 0.17572950440018528,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002661115489900112,
      "kl": 0.0022960519418120384,
      "learning_rate": 9.648633626679017e-07,
      "loss": 0.0001,
      "num_tokens": 104385251.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3794,
      "step_time": 19.423167020082474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 131.0,
      "completions/mean_length": 117.125,
      "completions/mean_terminated_length": 117.125,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2657902389764786,
      "epoch": 0.17577582213987958,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003785985754802823,
      "kl": 0.0022969872516114265,
      "learning_rate": 9.648540991199628e-07,
      "loss": 0.0001,
      "num_tokens": 104405925.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3795,
      "step_time": 12.784420773386955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 147.625,
      "completions/mean_terminated_length": 147.625,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.3875296860933304,
      "epoch": 0.17582213987957387,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031278333626687527,
      "kl": 0.0026590877096168697,
      "learning_rate": 9.64844835572024e-07,
      "loss": 0.0001,
      "num_tokens": 104431727.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3796,
      "step_time": 16.57256530225277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 324.0,
      "completions/max_terminated_length": 324.0,
      "completions/mean_length": 294.1875,
      "completions/mean_terminated_length": 294.1875,
      "completions/min_length": 212.0,
      "completions/min_terminated_length": 212.0,
      "entropy": 0.24025741964578629,
      "epoch": 0.17586845761926817,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07518387585878372,
      "kl": 0.014827840030193329,
      "learning_rate": 9.64835572024085e-07,
      "loss": -0.0995,
      "num_tokens": 104471890.0,
      "reward": 0.7640482187271118,
      "reward_std": 0.3236340582370758,
      "rewards/reward_func/mean": 0.7640482187271118,
      "rewards/reward_func/std": 0.3236340880393982,
      "step": 3797,
      "step_time": 30.441859886050224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 191.0,
      "completions/mean_terminated_length": 191.0,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.3019564151763916,
      "epoch": 0.1759147753589625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12074263393878937,
      "kl": 0.010099475388415158,
      "learning_rate": 9.648263084761462e-07,
      "loss": -0.0083,
      "num_tokens": 104494066.0,
      "reward": 0.962847113609314,
      "reward_std": 0.02586994878947735,
      "rewards/reward_func/mean": 0.962847113609314,
      "rewards/reward_func/std": 0.02586994506418705,
      "step": 3798,
      "step_time": 20.14840253815055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 323.0,
      "completions/max_terminated_length": 323.0,
      "completions/mean_length": 236.5,
      "completions/mean_terminated_length": 236.5,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.44835473597049713,
      "epoch": 0.1759610930986568,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10527169704437256,
      "kl": 0.004380170023068786,
      "learning_rate": 9.648170449282076e-07,
      "loss": 0.124,
      "num_tokens": 104516666.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 3799,
      "step_time": 26.33690120279789
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 134.4375,
      "completions/mean_terminated_length": 134.4375,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.35252758860588074,
      "epoch": 0.17600741083835109,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005306762643158436,
      "kl": 0.0034655986819416285,
      "learning_rate": 9.648077813802687e-07,
      "loss": 0.0002,
      "num_tokens": 104536689.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3800,
      "step_time": 14.679051656275988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 157.3125,
      "completions/mean_terminated_length": 157.3125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.24515363201498985,
      "epoch": 0.17605372857804538,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14069297909736633,
      "kl": 0.0238860035315156,
      "learning_rate": 9.647985178323296e-07,
      "loss": -0.0122,
      "num_tokens": 104558486.0,
      "reward": 0.464751660823822,
      "reward_std": 0.3110589385032654,
      "rewards/reward_func/mean": 0.464751660823822,
      "rewards/reward_func/std": 0.31105896830558777,
      "step": 3801,
      "step_time": 17.15539462864399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 187.375,
      "completions/mean_terminated_length": 187.375,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.3115087226033211,
      "epoch": 0.1761000463177397,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30319005250930786,
      "kl": 0.023208333179354668,
      "learning_rate": 9.64789254284391e-07,
      "loss": -0.0291,
      "num_tokens": 104584700.0,
      "reward": 0.5386459827423096,
      "reward_std": 0.4325544536113739,
      "rewards/reward_func/mean": 0.5386459827423096,
      "rewards/reward_func/std": 0.4325544536113739,
      "step": 3802,
      "step_time": 21.620222613215446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 281.0,
      "completions/max_terminated_length": 281.0,
      "completions/mean_length": 211.8125,
      "completions/mean_terminated_length": 211.8125,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.20918000489473343,
      "epoch": 0.176146364057434,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15997625887393951,
      "kl": 0.014598413603380322,
      "learning_rate": 9.64779990736452e-07,
      "loss": -0.1067,
      "num_tokens": 104623193.0,
      "reward": 0.5002998113632202,
      "reward_std": 0.29802316427230835,
      "rewards/reward_func/mean": 0.5002998113632202,
      "rewards/reward_func/std": 0.29802316427230835,
      "step": 3803,
      "step_time": 27.53459670767188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 138.0625,
      "completions/mean_terminated_length": 138.0625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.31225357949733734,
      "epoch": 0.1761926817971283,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003856898285448551,
      "kl": 0.0028518750332295895,
      "learning_rate": 9.647707271885132e-07,
      "loss": 0.0001,
      "num_tokens": 104644970.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3804,
      "step_time": 14.3464663811028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 160.3125,
      "completions/mean_terminated_length": 160.3125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.17767687886953354,
      "epoch": 0.1762389995368226,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013742530718445778,
      "kl": 0.00119293577154167,
      "learning_rate": 9.647614636405743e-07,
      "loss": 0.0001,
      "num_tokens": 104697951.0,
      "reward": 0.8751733303070068,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8751733303070068,
      "rewards/reward_func/std": 0.0,
      "step": 3805,
      "step_time": 23.665065202862024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 121.0625,
      "completions/mean_terminated_length": 121.0625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.29276765137910843,
      "epoch": 0.17628531727651692,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026806010864675045,
      "kl": 0.0021326115529518574,
      "learning_rate": 9.647522000926354e-07,
      "loss": 0.0001,
      "num_tokens": 104718336.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3806,
      "step_time": 13.176405761390924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 114.0,
      "completions/max_terminated_length": 114.0,
      "completions/mean_length": 100.1875,
      "completions/mean_terminated_length": 100.1875,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "entropy": 0.29868993163108826,
      "epoch": 0.1763316350162112,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025214203633368015,
      "kl": 0.0015813722275197506,
      "learning_rate": 9.647429365446966e-07,
      "loss": 0.0001,
      "num_tokens": 104740259.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3807,
      "step_time": 12.031390871852636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 139.125,
      "completions/mean_terminated_length": 139.125,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.2776578664779663,
      "epoch": 0.1763779527559055,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035993652418255806,
      "kl": 0.0025423296028748155,
      "learning_rate": 9.647336729967577e-07,
      "loss": 0.0001,
      "num_tokens": 104763749.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3808,
      "step_time": 15.653035368770361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 170.75,
      "completions/mean_terminated_length": 170.75,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.12160328961908817,
      "epoch": 0.1764242704955998,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005729260388761759,
      "kl": 0.005660561204422265,
      "learning_rate": 9.647244094488188e-07,
      "loss": 0.0003,
      "num_tokens": 104786385.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3809,
      "step_time": 16.994318760931492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 138.8125,
      "completions/mean_terminated_length": 138.8125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.3268174082040787,
      "epoch": 0.17647058823529413,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0053449454717338085,
      "kl": 0.0030448375619016588,
      "learning_rate": 9.6471514590088e-07,
      "loss": 0.0002,
      "num_tokens": 104813918.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3810,
      "step_time": 17.80192095041275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 188.4375,
      "completions/mean_terminated_length": 188.4375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.27199970930814743,
      "epoch": 0.17651690597498843,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12092722207307816,
      "kl": 0.011022645980119705,
      "learning_rate": 9.64705882352941e-07,
      "loss": -0.1191,
      "num_tokens": 104835477.0,
      "reward": 0.30022096633911133,
      "reward_std": 0.4599035382270813,
      "rewards/reward_func/mean": 0.30022096633911133,
      "rewards/reward_func/std": 0.4599035382270813,
      "step": 3811,
      "step_time": 23.86600050330162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 164.0625,
      "completions/mean_terminated_length": 164.0625,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.428058497607708,
      "epoch": 0.17656322371468272,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022345797624439,
      "kl": 0.0023550239857286215,
      "learning_rate": 9.646966188050024e-07,
      "loss": 0.0001,
      "num_tokens": 104870454.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3812,
      "step_time": 20.78255834430456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 163.4375,
      "completions/mean_terminated_length": 163.4375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3997066244482994,
      "epoch": 0.17660954145437702,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022781190928071737,
      "kl": 0.002448283543344587,
      "learning_rate": 9.646873552570635e-07,
      "loss": 0.0001,
      "num_tokens": 104923053.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3813,
      "step_time": 24.927198097109795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 208.5625,
      "completions/mean_terminated_length": 208.5625,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.44674404710531235,
      "epoch": 0.17665585919407134,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08394753187894821,
      "kl": 0.007794674369506538,
      "learning_rate": 9.646780917091244e-07,
      "loss": -0.0347,
      "num_tokens": 104963014.0,
      "reward": 0.00010168392327614129,
      "reward_std": 0.00040673569310456514,
      "rewards/reward_func/mean": 0.00010168392327614129,
      "rewards/reward_func/std": 0.0004067357222083956,
      "step": 3814,
      "step_time": 25.669972147792578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 167.0,
      "completions/mean_terminated_length": 167.0,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.16508163511753082,
      "epoch": 0.17670217693376564,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001488029258325696,
      "kl": 0.001016217895084992,
      "learning_rate": 9.646688281611858e-07,
      "loss": 0.0001,
      "num_tokens": 105009382.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 3815,
      "step_time": 23.64604101702571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 176.3125,
      "completions/mean_terminated_length": 176.3125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.17832287400960922,
      "epoch": 0.17674849467345993,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005954586435109377,
      "kl": 0.004308550618588924,
      "learning_rate": 9.64659564613247e-07,
      "loss": 0.0002,
      "num_tokens": 105051947.0,
      "reward": 0.9622687101364136,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9622687101364136,
      "rewards/reward_func/std": 0.0,
      "step": 3816,
      "step_time": 23.484891396015882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 148.1875,
      "completions/mean_terminated_length": 148.1875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.41103046387434006,
      "epoch": 0.17679481241315423,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023694795090705156,
      "kl": 0.002204233198426664,
      "learning_rate": 9.64650301065308e-07,
      "loss": 0.0001,
      "num_tokens": 105085278.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3817,
      "step_time": 18.35793075338006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 325.0,
      "completions/max_terminated_length": 325.0,
      "completions/mean_length": 242.125,
      "completions/mean_terminated_length": 242.125,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.44671790301799774,
      "epoch": 0.17684113015284855,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09875843673944473,
      "kl": 0.006757065188139677,
      "learning_rate": 9.646410375173691e-07,
      "loss": 0.0929,
      "num_tokens": 105109472.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 3818,
      "step_time": 26.869915205985308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 135.5,
      "completions/mean_terminated_length": 135.5,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3203148618340492,
      "epoch": 0.17688744789254285,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024329540319740772,
      "kl": 0.002018020866671577,
      "learning_rate": 9.646317739694303e-07,
      "loss": 0.0001,
      "num_tokens": 105131112.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3819,
      "step_time": 14.587321132421494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 292.0,
      "completions/max_terminated_length": 292.0,
      "completions/mean_length": 264.9375,
      "completions/mean_terminated_length": 264.9375,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "entropy": 0.18735438585281372,
      "epoch": 0.17693376563223714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11418405920267105,
      "kl": 0.004360408231150359,
      "learning_rate": 9.646225104214914e-07,
      "loss": 0.0001,
      "num_tokens": 105170487.0,
      "reward": 0.9792701601982117,
      "reward_std": 0.05159161239862442,
      "rewards/reward_func/mean": 0.9792701601982117,
      "rewards/reward_func/std": 0.051591601222753525,
      "step": 3820,
      "step_time": 27.922769486904144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 154.6875,
      "completions/mean_terminated_length": 154.6875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.18828526511788368,
      "epoch": 0.17698008337193144,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.020355554297566414,
      "kl": 0.012405019253492355,
      "learning_rate": 9.646132468735525e-07,
      "loss": 0.0006,
      "num_tokens": 105193314.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 3821,
      "step_time": 19.056558277457952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 157.125,
      "completions/mean_terminated_length": 157.125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.2825987935066223,
      "epoch": 0.17702640111162576,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018772652838379145,
      "kl": 0.0016300764691550285,
      "learning_rate": 9.646039833256136e-07,
      "loss": 0.0001,
      "num_tokens": 105215508.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3822,
      "step_time": 17.652187269181013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 125.0625,
      "completions/mean_terminated_length": 125.0625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.24263647943735123,
      "epoch": 0.17707271885132006,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00283534056507051,
      "kl": 0.001963997376151383,
      "learning_rate": 9.645947197776748e-07,
      "loss": 0.0001,
      "num_tokens": 105237653.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3823,
      "step_time": 14.83386355638504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 131.25,
      "completions/mean_terminated_length": 131.25,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.31897375732660294,
      "epoch": 0.17711903659101436,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00894392840564251,
      "kl": 0.004011940793134272,
      "learning_rate": 9.645854562297359e-07,
      "loss": 0.0002,
      "num_tokens": 105266265.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3824,
      "step_time": 15.94401427730918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 121.25,
      "completions/mean_terminated_length": 121.25,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.30347325652837753,
      "epoch": 0.17716535433070865,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005716771353036165,
      "kl": 0.002859598316717893,
      "learning_rate": 9.645761926817972e-07,
      "loss": 0.0001,
      "num_tokens": 105286285.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3825,
      "step_time": 14.27716787904501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 221.0,
      "completions/mean_terminated_length": 221.0,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "entropy": 0.21537592262029648,
      "epoch": 0.17721167207040298,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0053020622581243515,
      "kl": 0.003980011155363172,
      "learning_rate": 9.645669291338581e-07,
      "loss": 0.0002,
      "num_tokens": 105317613.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3826,
      "step_time": 23.069328784942627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 144.0,
      "completions/mean_terminated_length": 144.0,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.29018886387348175,
      "epoch": 0.17725798981009727,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030249236151576042,
      "kl": 0.0020077748922631145,
      "learning_rate": 9.645576655859193e-07,
      "loss": 0.0001,
      "num_tokens": 105353901.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3827,
      "step_time": 19.266102500259876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 184.6875,
      "completions/mean_terminated_length": 184.6875,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.21682461351156235,
      "epoch": 0.17730430754979157,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009777910076081753,
      "kl": 0.009190604905597866,
      "learning_rate": 9.645484020379804e-07,
      "loss": 0.0005,
      "num_tokens": 105375112.0,
      "reward": 0.7589176297187805,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7589176297187805,
      "rewards/reward_func/std": 0.0,
      "step": 3828,
      "step_time": 19.212503224611282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 289.0,
      "completions/max_terminated_length": 289.0,
      "completions/mean_length": 219.25,
      "completions/mean_terminated_length": 219.25,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.5217312127351761,
      "epoch": 0.17735062528948586,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10653503984212875,
      "kl": 0.009533015778288245,
      "learning_rate": 9.645391384900417e-07,
      "loss": -0.0397,
      "num_tokens": 105397036.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 3829,
      "step_time": 24.044645447283983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 199.5625,
      "completions/mean_terminated_length": 199.5625,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.29278120398521423,
      "epoch": 0.1773969430291802,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10548482090234756,
      "kl": 0.006157454918138683,
      "learning_rate": 9.645298749421029e-07,
      "loss": 0.0295,
      "num_tokens": 105433477.0,
      "reward": 0.8918563723564148,
      "reward_std": 0.2955045998096466,
      "rewards/reward_func/mean": 0.8918563723564148,
      "rewards/reward_func/std": 0.295504629611969,
      "step": 3830,
      "step_time": 23.892690498381853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 169.3125,
      "completions/mean_terminated_length": 169.3125,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.27810313552618027,
      "epoch": 0.17744326076887448,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005729720462113619,
      "kl": 0.004573135694954544,
      "learning_rate": 9.64520611394164e-07,
      "loss": 0.0002,
      "num_tokens": 105454746.0,
      "reward": 0.24073302745819092,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.24073302745819092,
      "rewards/reward_func/std": 0.0,
      "step": 3831,
      "step_time": 17.41228273883462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 163.4375,
      "completions/mean_terminated_length": 163.4375,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.12800641171634197,
      "epoch": 0.17748957850856878,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016617050860077143,
      "kl": 0.0012654691090574488,
      "learning_rate": 9.64511347846225e-07,
      "loss": 0.0001,
      "num_tokens": 105476273.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 3832,
      "step_time": 16.543955411762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 136.75,
      "completions/mean_terminated_length": 136.75,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.189119603484869,
      "epoch": 0.17753589624826308,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006521198898553848,
      "kl": 0.003373265906702727,
      "learning_rate": 9.645020842982862e-07,
      "loss": 0.0002,
      "num_tokens": 105499885.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 3833,
      "step_time": 17.39403572306037
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 165.0,
      "completions/mean_terminated_length": 165.0,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.36929506063461304,
      "epoch": 0.1775822139879574,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0044189090840518475,
      "kl": 0.0037194338510744274,
      "learning_rate": 9.644928207503474e-07,
      "loss": 0.0002,
      "num_tokens": 105525821.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3834,
      "step_time": 17.734925776720047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 170.75,
      "completions/mean_terminated_length": 170.75,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.34637551009655,
      "epoch": 0.1776285317276517,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13116101920604706,
      "kl": 0.013561037834733725,
      "learning_rate": 9.644835572024085e-07,
      "loss": -0.0654,
      "num_tokens": 105546985.0,
      "reward": 0.17457427084445953,
      "reward_std": 0.37532341480255127,
      "rewards/reward_func/mean": 0.17457427084445953,
      "rewards/reward_func/std": 0.3753233850002289,
      "step": 3835,
      "step_time": 18.447752952575684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 168.8125,
      "completions/mean_terminated_length": 168.8125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.19035568088293076,
      "epoch": 0.177674849467346,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002532113343477249,
      "kl": 0.002177273971028626,
      "learning_rate": 9.644742936544696e-07,
      "loss": 0.0001,
      "num_tokens": 105569606.0,
      "reward": 0.5139832496643066,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5139832496643066,
      "rewards/reward_func/std": 0.0,
      "step": 3836,
      "step_time": 18.159806467592716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 154.5,
      "completions/mean_terminated_length": 154.5,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.17702282220125198,
      "epoch": 0.1777211672070403,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002190358005464077,
      "kl": 0.0014314647996798158,
      "learning_rate": 9.644650301065307e-07,
      "loss": 0.0001,
      "num_tokens": 105594878.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 3837,
      "step_time": 17.425941973924637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 263.0,
      "completions/max_terminated_length": 263.0,
      "completions/mean_length": 237.5625,
      "completions/mean_terminated_length": 237.5625,
      "completions/min_length": 212.0,
      "completions/min_terminated_length": 212.0,
      "entropy": 0.1605139933526516,
      "epoch": 0.1777674849467346,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09193283319473267,
      "kl": 0.003897265298292041,
      "learning_rate": 9.644557665585919e-07,
      "loss": -0.0262,
      "num_tokens": 105622295.0,
      "reward": 0.7201932668685913,
      "reward_std": 0.004916071891784668,
      "rewards/reward_func/mean": 0.7201932668685913,
      "rewards/reward_func/std": 0.004916071891784668,
      "step": 3838,
      "step_time": 24.34452408924699
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 136.625,
      "completions/mean_terminated_length": 136.625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.33212994784116745,
      "epoch": 0.1778138026864289,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00414050230756402,
      "kl": 0.002761340991128236,
      "learning_rate": 9.64446503010653e-07,
      "loss": 0.0001,
      "num_tokens": 105647281.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3839,
      "step_time": 16.613259498029947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 137.9375,
      "completions/mean_terminated_length": 137.9375,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.2578519880771637,
      "epoch": 0.1778601204261232,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004538194742053747,
      "kl": 0.00209065355011262,
      "learning_rate": 9.64437239462714e-07,
      "loss": 0.0001,
      "num_tokens": 105666992.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3840,
      "step_time": 14.258984357118607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 134.3125,
      "completions/mean_terminated_length": 134.3125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.31515443325042725,
      "epoch": 0.1779064381658175,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007335335481911898,
      "kl": 0.004559081397019327,
      "learning_rate": 9.644279759147752e-07,
      "loss": 0.0002,
      "num_tokens": 105689077.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3841,
      "step_time": 16.859371077269316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 184.75,
      "completions/mean_terminated_length": 184.75,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.1951061300933361,
      "epoch": 0.17795275590551182,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13436013460159302,
      "kl": 0.006344126304611564,
      "learning_rate": 9.644187123668366e-07,
      "loss": -0.0208,
      "num_tokens": 105747601.0,
      "reward": 0.9688286781311035,
      "reward_std": 0.06701634079217911,
      "rewards/reward_func/mean": 0.9688286781311035,
      "rewards/reward_func/std": 0.06701634079217911,
      "step": 3842,
      "step_time": 29.210843361914158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 173.6875,
      "completions/mean_terminated_length": 173.6875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.3495946228504181,
      "epoch": 0.17799907364520612,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009424271993339062,
      "kl": 0.006038697552867234,
      "learning_rate": 9.644094488188977e-07,
      "loss": 0.0003,
      "num_tokens": 105771196.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3843,
      "step_time": 17.927072402089834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 122.5,
      "completions/mean_terminated_length": 122.5,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.33498673886060715,
      "epoch": 0.17804539138490041,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005292301531881094,
      "kl": 0.0030770490411669016,
      "learning_rate": 9.644001852709588e-07,
      "loss": 0.0002,
      "num_tokens": 105798036.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3844,
      "step_time": 15.143288355320692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 175.5,
      "completions/mean_terminated_length": 175.5,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.13562552630901337,
      "epoch": 0.1780917091245947,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002276982879266143,
      "kl": 0.0014080112450756133,
      "learning_rate": 9.6439092172302e-07,
      "loss": 0.0001,
      "num_tokens": 105830460.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3845,
      "step_time": 20.715296521782875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 138.375,
      "completions/mean_terminated_length": 138.375,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.24147973209619522,
      "epoch": 0.17813802686428903,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003436050144955516,
      "kl": 0.0024765877751633525,
      "learning_rate": 9.64381658175081e-07,
      "loss": 0.0001,
      "num_tokens": 105850178.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3846,
      "step_time": 14.213184762746096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 130.0625,
      "completions/mean_terminated_length": 130.0625,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.27036910876631737,
      "epoch": 0.17818434460398333,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004406011663377285,
      "kl": 0.002435957605484873,
      "learning_rate": 9.643723946271422e-07,
      "loss": 0.0001,
      "num_tokens": 105871475.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3847,
      "step_time": 14.044939454644918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 156.9375,
      "completions/mean_terminated_length": 156.9375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3302168771624565,
      "epoch": 0.17823066234367763,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024486184120178223,
      "kl": 0.00217066629556939,
      "learning_rate": 9.643631310792033e-07,
      "loss": 0.0001,
      "num_tokens": 105903410.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3848,
      "step_time": 19.308651093393564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 341.0,
      "completions/max_terminated_length": 341.0,
      "completions/mean_length": 238.375,
      "completions/mean_terminated_length": 238.375,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.5138166099786758,
      "epoch": 0.17827698008337192,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12125244736671448,
      "kl": 0.007284591440111399,
      "learning_rate": 9.643538675312644e-07,
      "loss": 0.0377,
      "num_tokens": 105929720.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 3849,
      "step_time": 28.40598590299487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 157.1875,
      "completions/mean_terminated_length": 157.1875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3992660269141197,
      "epoch": 0.17832329782306625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004069524817168713,
      "kl": 0.002859303029254079,
      "learning_rate": 9.643446039833256e-07,
      "loss": 0.0001,
      "num_tokens": 105962923.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3850,
      "step_time": 19.518219359219074
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 257.0,
      "completions/max_terminated_length": 257.0,
      "completions/mean_length": 234.125,
      "completions/mean_terminated_length": 234.125,
      "completions/min_length": 212.0,
      "completions/min_terminated_length": 212.0,
      "entropy": 0.2323986180126667,
      "epoch": 0.17836961556276054,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002475576940923929,
      "kl": 0.001692907593678683,
      "learning_rate": 9.643353404353867e-07,
      "loss": 0.0001,
      "num_tokens": 105999181.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3851,
      "step_time": 24.847377281636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 156.375,
      "completions/mean_terminated_length": 156.375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.1864527016878128,
      "epoch": 0.17841593330245484,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005519297439604998,
      "kl": 0.002675230032764375,
      "learning_rate": 9.643260768874478e-07,
      "loss": 0.0001,
      "num_tokens": 106026419.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 3852,
      "step_time": 17.847322486341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 164.1875,
      "completions/mean_terminated_length": 164.1875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.36013707518577576,
      "epoch": 0.17846225104214913,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005480221938341856,
      "kl": 0.004316116275731474,
      "learning_rate": 9.64316813339509e-07,
      "loss": 0.0002,
      "num_tokens": 106047558.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3853,
      "step_time": 16.41007661446929
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 148.4375,
      "completions/mean_terminated_length": 148.4375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.24752887338399887,
      "epoch": 0.17850856878184346,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14251220226287842,
      "kl": 0.005275412113405764,
      "learning_rate": 9.6430754979157e-07,
      "loss": -0.0577,
      "num_tokens": 106069421.0,
      "reward": 0.9293943643569946,
      "reward_std": 0.035030219703912735,
      "rewards/reward_func/mean": 0.9293943643569946,
      "rewards/reward_func/std": 0.03503022342920303,
      "step": 3854,
      "step_time": 17.581236638128757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 159.3125,
      "completions/mean_terminated_length": 159.3125,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.35773012042045593,
      "epoch": 0.17855488652153775,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009938415139913559,
      "kl": 0.008871463127434254,
      "learning_rate": 9.642982862436314e-07,
      "loss": 0.0004,
      "num_tokens": 106093938.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3855,
      "step_time": 18.866581570357084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 132.6875,
      "completions/mean_terminated_length": 132.6875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.23759281262755394,
      "epoch": 0.17860120426123205,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034014505799859762,
      "kl": 0.0020460054656723514,
      "learning_rate": 9.642890226956925e-07,
      "loss": 0.0001,
      "num_tokens": 106114141.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3856,
      "step_time": 14.944848220795393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 127.625,
      "completions/mean_terminated_length": 127.625,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.3074854612350464,
      "epoch": 0.17864752200092635,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004036040045320988,
      "kl": 0.0027269473066553473,
      "learning_rate": 9.642797591477534e-07,
      "loss": 0.0001,
      "num_tokens": 106135895.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3857,
      "step_time": 15.013610441237688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 186.0,
      "completions/mean_terminated_length": 186.0,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.4452502205967903,
      "epoch": 0.17869383974062067,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027183243073523045,
      "kl": 0.0027735581970773637,
      "learning_rate": 9.642704955998146e-07,
      "loss": 0.0001,
      "num_tokens": 106165703.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3858,
      "step_time": 20.652802895754576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 130.125,
      "completions/mean_terminated_length": 130.125,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.20212046429514885,
      "epoch": 0.17874015748031497,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012862884439527988,
      "kl": 0.0025874590792227536,
      "learning_rate": 9.64261232051876e-07,
      "loss": 0.0001,
      "num_tokens": 106185305.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3859,
      "step_time": 13.375053532421589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 195.3125,
      "completions/mean_terminated_length": 195.3125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.37439778447151184,
      "epoch": 0.17878647522000926,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11001267284154892,
      "kl": 0.008875812869518995,
      "learning_rate": 9.64251968503937e-07,
      "loss": 0.0538,
      "num_tokens": 106207662.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 3860,
      "step_time": 21.003232218325138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 173.4375,
      "completions/mean_terminated_length": 173.4375,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.26884597167372704,
      "epoch": 0.17883279295970356,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10725853592157364,
      "kl": 0.0035685841576196253,
      "learning_rate": 9.642427049559981e-07,
      "loss": 0.0107,
      "num_tokens": 106231493.0,
      "reward": 0.12200583517551422,
      "reward_std": 0.004130990710109472,
      "rewards/reward_func/mean": 0.12200583517551422,
      "rewards/reward_func/std": 0.004130990710109472,
      "step": 3861,
      "step_time": 18.338267598301172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 179.5,
      "completions/mean_terminated_length": 179.5,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.18928562477231026,
      "epoch": 0.17887911069939788,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.016471249982714653,
      "kl": 0.024105083663016558,
      "learning_rate": 9.642334414080593e-07,
      "loss": 0.0012,
      "num_tokens": 106261037.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3862,
      "step_time": 20.131963800638914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 195.75,
      "completions/mean_terminated_length": 195.75,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.3357459381222725,
      "epoch": 0.17892542843909218,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14837013185024261,
      "kl": 0.004607123322784901,
      "learning_rate": 9.642241778601204e-07,
      "loss": 0.0085,
      "num_tokens": 106293321.0,
      "reward": 0.4073399603366852,
      "reward_std": 0.4770277142524719,
      "rewards/reward_func/mean": 0.4073399603366852,
      "rewards/reward_func/std": 0.47702768445014954,
      "step": 3863,
      "step_time": 23.77440858259797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 168.0625,
      "completions/mean_terminated_length": 168.0625,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.1621304452419281,
      "epoch": 0.17897174617878647,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033217810560017824,
      "kl": 0.002067015739157796,
      "learning_rate": 9.642149143121815e-07,
      "loss": 0.0001,
      "num_tokens": 106330346.0,
      "reward": 0.39511775970458984,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.39511775970458984,
      "rewards/reward_func/std": 0.0,
      "step": 3864,
      "step_time": 20.833427734673023
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 132.9375,
      "completions/mean_terminated_length": 132.9375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.3487633988261223,
      "epoch": 0.17901806391848077,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002550463890656829,
      "kl": 0.002032527787378058,
      "learning_rate": 9.642056507642426e-07,
      "loss": 0.0001,
      "num_tokens": 106352617.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3865,
      "step_time": 14.310365248471498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 197.0625,
      "completions/mean_terminated_length": 197.0625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.322141170501709,
      "epoch": 0.1790643816581751,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01066620834171772,
      "kl": 0.010315066552720964,
      "learning_rate": 9.641963872163038e-07,
      "loss": 0.0005,
      "num_tokens": 106374698.0,
      "reward": 0.24659696221351624,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.24659696221351624,
      "rewards/reward_func/std": 0.0,
      "step": 3866,
      "step_time": 20.995813205838203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 294.0,
      "completions/max_terminated_length": 294.0,
      "completions/mean_length": 203.75,
      "completions/mean_terminated_length": 203.75,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.4266778379678726,
      "epoch": 0.1791106993978694,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006910845171660185,
      "kl": 0.005342388059943914,
      "learning_rate": 9.64187123668365e-07,
      "loss": 0.0003,
      "num_tokens": 106412214.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3867,
      "step_time": 28.35900116711855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 117.375,
      "completions/mean_terminated_length": 117.375,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "entropy": 0.24351060763001442,
      "epoch": 0.17915701713756368,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003948332276195288,
      "kl": 0.0018374093051534146,
      "learning_rate": 9.64177860120426e-07,
      "loss": 0.0001,
      "num_tokens": 106431484.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3868,
      "step_time": 13.185180019587278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 139.5625,
      "completions/mean_terminated_length": 139.5625,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.18132193014025688,
      "epoch": 0.17920333487725798,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12919850647449493,
      "kl": 0.003018807154148817,
      "learning_rate": 9.641685965724871e-07,
      "loss": -0.04,
      "num_tokens": 106452213.0,
      "reward": 0.0763099268078804,
      "reward_std": 0.22499586641788483,
      "rewards/reward_func/mean": 0.0763099268078804,
      "rewards/reward_func/std": 0.22499586641788483,
      "step": 3869,
      "step_time": 15.816828023642302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 154.375,
      "completions/mean_terminated_length": 154.375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.32873932272195816,
      "epoch": 0.1792496526169523,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.016376294195652008,
      "kl": 0.012700323015451431,
      "learning_rate": 9.641593330245483e-07,
      "loss": 0.0006,
      "num_tokens": 106476683.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3870,
      "step_time": 18.558602813631296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 121.875,
      "completions/mean_terminated_length": 121.875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2800655737519264,
      "epoch": 0.1792959703566466,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018830408807843924,
      "kl": 0.0016500942001584917,
      "learning_rate": 9.641500694766094e-07,
      "loss": 0.0001,
      "num_tokens": 106496441.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3871,
      "step_time": 13.768268425017595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 132.1875,
      "completions/mean_terminated_length": 132.1875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.2514149695634842,
      "epoch": 0.1793422880963409,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00770987942814827,
      "kl": 0.003763327025808394,
      "learning_rate": 9.641408059286707e-07,
      "loss": 0.0002,
      "num_tokens": 106517340.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3872,
      "step_time": 14.451893799006939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 164.9375,
      "completions/mean_terminated_length": 164.9375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.47073114663362503,
      "epoch": 0.1793886058360352,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025714649818837643,
      "kl": 0.0025621079839766026,
      "learning_rate": 9.641315423807319e-07,
      "loss": 0.0001,
      "num_tokens": 106548475.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3873,
      "step_time": 19.580643940716982
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 130.4375,
      "completions/mean_terminated_length": 130.4375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2749612033367157,
      "epoch": 0.17943492357572952,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00378880905918777,
      "kl": 0.002403245773166418,
      "learning_rate": 9.64122278832793e-07,
      "loss": 0.0001,
      "num_tokens": 106572802.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3874,
      "step_time": 15.559952519834042
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 189.0625,
      "completions/mean_terminated_length": 189.0625,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.17167653515934944,
      "epoch": 0.1794812413154238,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002166612772271037,
      "kl": 0.001824690873036161,
      "learning_rate": 9.64113015284854e-07,
      "loss": 0.0001,
      "num_tokens": 106610067.0,
      "reward": 0.8970773816108704,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8970773816108704,
      "rewards/reward_func/std": 0.0,
      "step": 3875,
      "step_time": 24.00137247145176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 201.25,
      "completions/mean_terminated_length": 201.25,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.2034187652170658,
      "epoch": 0.1795275590551181,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004419104196131229,
      "kl": 0.004266760079190135,
      "learning_rate": 9.641037517369152e-07,
      "loss": 0.0002,
      "num_tokens": 106646567.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3876,
      "step_time": 25.514030795544386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 181.625,
      "completions/mean_terminated_length": 181.625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.2660621926188469,
      "epoch": 0.1795738767948124,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11790023744106293,
      "kl": 0.025686467299237847,
      "learning_rate": 9.640944881889764e-07,
      "loss": -0.0725,
      "num_tokens": 106681793.0,
      "reward": 0.7919888496398926,
      "reward_std": 0.3820759356021881,
      "rewards/reward_func/mean": 0.7919888496398926,
      "rewards/reward_func/std": 0.3820759654045105,
      "step": 3877,
      "step_time": 23.93250075355172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 324.0,
      "completions/max_terminated_length": 324.0,
      "completions/mean_length": 302.5625,
      "completions/mean_terminated_length": 302.5625,
      "completions/min_length": 265.0,
      "completions/min_terminated_length": 265.0,
      "entropy": 0.2444363832473755,
      "epoch": 0.17962019453450673,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004091878887265921,
      "kl": 0.0036927127512171865,
      "learning_rate": 9.640852246410375e-07,
      "loss": 0.0002,
      "num_tokens": 106709322.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3878,
      "step_time": 27.31113762408495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 173.25,
      "completions/mean_terminated_length": 173.25,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.44217732548713684,
      "epoch": 0.17966651227420102,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16478723287582397,
      "kl": 0.008424879633821547,
      "learning_rate": 9.640759610930986e-07,
      "loss": -0.0482,
      "num_tokens": 106742558.0,
      "reward": 0.05278949812054634,
      "reward_std": 0.21115797758102417,
      "rewards/reward_func/mean": 0.05278949812054634,
      "rewards/reward_func/std": 0.21115799248218536,
      "step": 3879,
      "step_time": 22.040966276079416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 172.9375,
      "completions/mean_terminated_length": 172.9375,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.2926242798566818,
      "epoch": 0.17971283001389532,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13188034296035767,
      "kl": 0.004058061807882041,
      "learning_rate": 9.640666975451597e-07,
      "loss": -0.0057,
      "num_tokens": 106779709.0,
      "reward": 0.4177449345588684,
      "reward_std": 0.11502572894096375,
      "rewards/reward_func/mean": 0.4177449345588684,
      "rewards/reward_func/std": 0.11502573639154434,
      "step": 3880,
      "step_time": 21.731445774435997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 125.3125,
      "completions/mean_terminated_length": 125.3125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2763097584247589,
      "epoch": 0.17975914775358962,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003961433190852404,
      "kl": 0.0024688647827133536,
      "learning_rate": 9.640574339972209e-07,
      "loss": 0.0001,
      "num_tokens": 106802610.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3881,
      "step_time": 14.127445373684168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 202.5625,
      "completions/mean_terminated_length": 202.5625,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.23512405157089233,
      "epoch": 0.17980546549328394,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001926533761434257,
      "kl": 0.002032506628893316,
      "learning_rate": 9.64048170449282e-07,
      "loss": 0.0001,
      "num_tokens": 106838955.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3882,
      "step_time": 23.26013696938753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 129.5625,
      "completions/mean_terminated_length": 129.5625,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2327059954404831,
      "epoch": 0.17985178323297824,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002668160479515791,
      "kl": 0.0015750114107504487,
      "learning_rate": 9.640389069013431e-07,
      "loss": 0.0001,
      "num_tokens": 106859700.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3883,
      "step_time": 13.915167830884457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 120.0,
      "completions/max_terminated_length": 120.0,
      "completions/mean_length": 104.0625,
      "completions/mean_terminated_length": 104.0625,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "entropy": 0.33782267570495605,
      "epoch": 0.17989810097267253,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005571055691689253,
      "kl": 0.002933368261437863,
      "learning_rate": 9.640296433534042e-07,
      "loss": 0.0001,
      "num_tokens": 106880085.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3884,
      "step_time": 12.020050313323736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 243.9375,
      "completions/mean_terminated_length": 243.9375,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "entropy": 0.2659078761935234,
      "epoch": 0.17994441871236683,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036406246945261955,
      "kl": 0.006943120853975415,
      "learning_rate": 9.640203798054656e-07,
      "loss": 0.0003,
      "num_tokens": 106908708.0,
      "reward": 0.740818202495575,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.740818202495575,
      "rewards/reward_func/std": 0.0,
      "step": 3885,
      "step_time": 25.21360557153821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 154.1875,
      "completions/mean_terminated_length": 154.1875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.24740813672542572,
      "epoch": 0.17999073645206115,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004300672560930252,
      "kl": 0.0032012086594477296,
      "learning_rate": 9.640111162575267e-07,
      "loss": 0.0002,
      "num_tokens": 106928487.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3886,
      "step_time": 16.44284963980317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 152.6875,
      "completions/mean_terminated_length": 152.6875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.20444755628705025,
      "epoch": 0.18003705419175545,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11522559076547623,
      "kl": 0.06768784299492836,
      "learning_rate": 9.640018527095878e-07,
      "loss": -0.0715,
      "num_tokens": 106950498.0,
      "reward": 0.6712665557861328,
      "reward_std": 0.21674343943595886,
      "rewards/reward_func/mean": 0.6712665557861328,
      "rewards/reward_func/std": 0.21674346923828125,
      "step": 3887,
      "step_time": 20.053553327918053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 146.25,
      "completions/mean_terminated_length": 146.25,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.35008594393730164,
      "epoch": 0.18008337193144974,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006665939465165138,
      "kl": 0.004400743579026312,
      "learning_rate": 9.639925891616487e-07,
      "loss": 0.0002,
      "num_tokens": 106973222.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3888,
      "step_time": 15.803185060620308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 250.1875,
      "completions/mean_terminated_length": 250.1875,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "entropy": 0.19690639898180962,
      "epoch": 0.18012968967114404,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09986148774623871,
      "kl": 0.011436095228418708,
      "learning_rate": 9.6398332561371e-07,
      "loss": -0.0021,
      "num_tokens": 107003321.0,
      "reward": 0.8035168647766113,
      "reward_std": 0.13681329786777496,
      "rewards/reward_func/mean": 0.8035168647766113,
      "rewards/reward_func/std": 0.13681329786777496,
      "step": 3889,
      "step_time": 24.08195485919714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 118.75,
      "completions/mean_terminated_length": 118.75,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.23816321045160294,
      "epoch": 0.18017600741083836,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004808017518371344,
      "kl": 0.0027467715553939342,
      "learning_rate": 9.639740620657712e-07,
      "loss": 0.0001,
      "num_tokens": 107023221.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3890,
      "step_time": 14.073711056262255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 171.125,
      "completions/mean_terminated_length": 171.125,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2762995511293411,
      "epoch": 0.18022232515053266,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10893756151199341,
      "kl": 0.005113076651468873,
      "learning_rate": 9.639647985178323e-07,
      "loss": -0.0393,
      "num_tokens": 107044679.0,
      "reward": 0.8529131412506104,
      "reward_std": 0.07281851023435593,
      "rewards/reward_func/mean": 0.8529131412506104,
      "rewards/reward_func/std": 0.07281851023435593,
      "step": 3891,
      "step_time": 18.657626669853926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 204.375,
      "completions/mean_terminated_length": 204.375,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.14512208476662636,
      "epoch": 0.18026864289022695,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08163206279277802,
      "kl": 0.005026340892072767,
      "learning_rate": 9.639555349698934e-07,
      "loss": -0.0171,
      "num_tokens": 107067197.0,
      "reward": 0.990176796913147,
      "reward_std": 0.017572296783328056,
      "rewards/reward_func/mean": 0.990176796913147,
      "rewards/reward_func/std": 0.017572306096553802,
      "step": 3892,
      "step_time": 19.930472690612078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 151.875,
      "completions/mean_terminated_length": 151.875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.34035804867744446,
      "epoch": 0.18031496062992125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006593564059585333,
      "kl": 0.0030465046875178814,
      "learning_rate": 9.639462714219546e-07,
      "loss": 0.0002,
      "num_tokens": 107096891.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3893,
      "step_time": 17.54202764481306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 222.9375,
      "completions/mean_terminated_length": 222.9375,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.19629066810011864,
      "epoch": 0.18036127836961557,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010079527273774147,
      "kl": 0.07693048194050789,
      "learning_rate": 9.639370078740157e-07,
      "loss": 0.0038,
      "num_tokens": 107123210.0,
      "reward": 0.8542837500572205,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8542837500572205,
      "rewards/reward_func/std": 0.0,
      "step": 3894,
      "step_time": 22.091420751065016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 208.25,
      "completions/mean_terminated_length": 208.25,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.25358671322464943,
      "epoch": 0.18040759610930987,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10642461478710175,
      "kl": 0.006079294253140688,
      "learning_rate": 9.639277443260768e-07,
      "loss": -0.0101,
      "num_tokens": 107157630.0,
      "reward": 0.9601125717163086,
      "reward_std": 0.010636663064360619,
      "rewards/reward_func/mean": 0.9601125717163086,
      "rewards/reward_func/std": 0.010636658407747746,
      "step": 3895,
      "step_time": 25.84510939568281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 135.1875,
      "completions/mean_terminated_length": 135.1875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.31263020634651184,
      "epoch": 0.18045391384900417,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028968616388738155,
      "kl": 0.0021427946048788726,
      "learning_rate": 9.63918480778138e-07,
      "loss": 0.0001,
      "num_tokens": 107178769.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3896,
      "step_time": 14.797037236392498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 124.3125,
      "completions/mean_terminated_length": 124.3125,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.273935966193676,
      "epoch": 0.18050023158869846,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004854854661971331,
      "kl": 0.0026416799519211054,
      "learning_rate": 9.63909217230199e-07,
      "loss": 0.0001,
      "num_tokens": 107200022.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3897,
      "step_time": 14.565889682620764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 176.25,
      "completions/mean_terminated_length": 176.25,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.41036058217287064,
      "epoch": 0.18054654932839279,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0051071178168058395,
      "kl": 0.004573448677547276,
      "learning_rate": 9.638999536822602e-07,
      "loss": 0.0002,
      "num_tokens": 107231082.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3898,
      "step_time": 23.072204168885946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 130.875,
      "completions/mean_terminated_length": 130.875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.28761573135852814,
      "epoch": 0.18059286706808708,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029935636557638645,
      "kl": 0.0024007526808418334,
      "learning_rate": 9.638906901343215e-07,
      "loss": 0.0001,
      "num_tokens": 107254600.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3899,
      "step_time": 14.77840093523264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 165.9375,
      "completions/mean_terminated_length": 165.9375,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.2848445922136307,
      "epoch": 0.18063918480778138,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12299530953168869,
      "kl": 0.025952158961445093,
      "learning_rate": 9.638814265863824e-07,
      "loss": 0.0335,
      "num_tokens": 107276135.0,
      "reward": 0.8355741500854492,
      "reward_std": 0.056173864752054214,
      "rewards/reward_func/mean": 0.8355741500854492,
      "rewards/reward_func/std": 0.05617387965321541,
      "step": 3900,
      "step_time": 17.51611840352416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 158.0625,
      "completions/mean_terminated_length": 158.0625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.2537331245839596,
      "epoch": 0.18068550254747567,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023763119243085384,
      "kl": 0.0018065246695186943,
      "learning_rate": 9.638721630384436e-07,
      "loss": 0.0001,
      "num_tokens": 107305352.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3901,
      "step_time": 18.49169960990548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 175.4375,
      "completions/mean_terminated_length": 175.4375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.311426617205143,
      "epoch": 0.18073182028717,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038535951171070337,
      "kl": 0.002921569102909416,
      "learning_rate": 9.63862899490505e-07,
      "loss": 0.0001,
      "num_tokens": 107330671.0,
      "reward": 0.3678794503211975,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3678794503211975,
      "rewards/reward_func/std": 0.0,
      "step": 3902,
      "step_time": 19.04279673472047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 161.1875,
      "completions/mean_terminated_length": 161.1875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.37105194479227066,
      "epoch": 0.1807781380268643,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025385727640241385,
      "kl": 0.0021450287313200533,
      "learning_rate": 9.63853635942566e-07,
      "loss": 0.0001,
      "num_tokens": 107359426.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3903,
      "step_time": 18.226160261780024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 126.625,
      "completions/mean_terminated_length": 126.625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.3110702708363533,
      "epoch": 0.1808244557665586,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002151668770238757,
      "kl": 0.0017418436182197183,
      "learning_rate": 9.638443723946272e-07,
      "loss": 0.0001,
      "num_tokens": 107395036.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3904,
      "step_time": 17.367747947573662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 284.0,
      "completions/max_terminated_length": 284.0,
      "completions/mean_length": 259.625,
      "completions/mean_terminated_length": 259.625,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "entropy": 0.14336872100830078,
      "epoch": 0.18087077350625289,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020849378779530525,
      "kl": 0.001666792668402195,
      "learning_rate": 9.638351088466883e-07,
      "loss": 0.0001,
      "num_tokens": 107425334.0,
      "reward": 0.9622687101364136,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9622687101364136,
      "rewards/reward_func/std": 0.0,
      "step": 3905,
      "step_time": 25.53689743205905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 311.0,
      "completions/max_terminated_length": 311.0,
      "completions/mean_length": 234.0,
      "completions/mean_terminated_length": 234.0,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.455336295068264,
      "epoch": 0.1809170912459472,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0899076759815216,
      "kl": 0.00596992252394557,
      "learning_rate": 9.638258452987494e-07,
      "loss": -0.0499,
      "num_tokens": 107461126.0,
      "reward": 0.43209439516067505,
      "reward_std": 0.4501776397228241,
      "rewards/reward_func/mean": 0.43209439516067505,
      "rewards/reward_func/std": 0.4501776397228241,
      "step": 3906,
      "step_time": 28.96193305402994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 107.5625,
      "completions/mean_terminated_length": 107.5625,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.25259115546941757,
      "epoch": 0.1809634089856415,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004304500296711922,
      "kl": 0.002399189746938646,
      "learning_rate": 9.638165817508105e-07,
      "loss": 0.0001,
      "num_tokens": 107481247.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3907,
      "step_time": 13.515427503734827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 193.8125,
      "completions/mean_terminated_length": 193.8125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.4437813311815262,
      "epoch": 0.1810097267253358,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008852921426296234,
      "kl": 0.0069910825695842505,
      "learning_rate": 9.638073182028717e-07,
      "loss": 0.0003,
      "num_tokens": 107506556.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3908,
      "step_time": 20.957769952714443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 163.0,
      "completions/mean_terminated_length": 163.0,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.20349428057670593,
      "epoch": 0.1810560444650301,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017322602216154337,
      "kl": 0.004391728434711695,
      "learning_rate": 9.637980546549328e-07,
      "loss": 0.0002,
      "num_tokens": 107529180.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 3909,
      "step_time": 17.231120854616165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 136.5625,
      "completions/mean_terminated_length": 136.5625,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.3487413823604584,
      "epoch": 0.18110236220472442,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004991558846086264,
      "kl": 0.0034653438488021493,
      "learning_rate": 9.63788791106994e-07,
      "loss": 0.0002,
      "num_tokens": 107565125.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3910,
      "step_time": 17.688213545829058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 309.0,
      "completions/max_terminated_length": 309.0,
      "completions/mean_length": 284.4375,
      "completions/mean_terminated_length": 284.4375,
      "completions/min_length": 253.0,
      "completions/min_terminated_length": 253.0,
      "entropy": 0.18257901072502136,
      "epoch": 0.18114867994441872,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003655927488580346,
      "kl": 0.003254766226746142,
      "learning_rate": 9.63779527559055e-07,
      "loss": 0.0002,
      "num_tokens": 107598460.0,
      "reward": 0.8920138478279114,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8920138478279114,
      "rewards/reward_func/std": 0.0,
      "step": 3911,
      "step_time": 27.935412857681513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 120.375,
      "completions/mean_terminated_length": 120.375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.20604503899812698,
      "epoch": 0.181194997684113,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031442560721188784,
      "kl": 0.00205564804491587,
      "learning_rate": 9.637702640111162e-07,
      "loss": 0.0001,
      "num_tokens": 107618178.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3912,
      "step_time": 13.054497182369232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 366.0,
      "completions/max_terminated_length": 366.0,
      "completions/mean_length": 282.25,
      "completions/mean_terminated_length": 282.25,
      "completions/min_length": 245.0,
      "completions/min_terminated_length": 245.0,
      "entropy": 0.30461709946393967,
      "epoch": 0.1812413154238073,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07777665555477142,
      "kl": 0.005089417682029307,
      "learning_rate": 9.637610004631773e-07,
      "loss": -0.0683,
      "num_tokens": 107658390.0,
      "reward": 0.3690711259841919,
      "reward_std": 0.15417903661727905,
      "rewards/reward_func/mean": 0.3690711259841919,
      "rewards/reward_func/std": 0.15417903661727905,
      "step": 3913,
      "step_time": 33.33389285206795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 137.3125,
      "completions/mean_terminated_length": 137.3125,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.27665771543979645,
      "epoch": 0.18128763316350163,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.02194824256002903,
      "kl": 0.007084732060320675,
      "learning_rate": 9.637517369152384e-07,
      "loss": 0.0004,
      "num_tokens": 107678027.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3914,
      "step_time": 14.059442419558764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 162.125,
      "completions/mean_terminated_length": 162.125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.17007062211632729,
      "epoch": 0.18133395090319593,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005854338873177767,
      "kl": 0.004630892304703593,
      "learning_rate": 9.637424733672997e-07,
      "loss": 0.0002,
      "num_tokens": 107699773.0,
      "reward": 0.9574533700942993,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9574533700942993,
      "rewards/reward_func/std": 0.0,
      "step": 3915,
      "step_time": 17.851514037698507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 178.75,
      "completions/mean_terminated_length": 178.75,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.25144483149051666,
      "epoch": 0.18138026864289022,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007408967707306147,
      "kl": 0.006645939662121236,
      "learning_rate": 9.637332098193609e-07,
      "loss": 0.0003,
      "num_tokens": 107722841.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3916,
      "step_time": 18.072697196155787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 154.125,
      "completions/mean_terminated_length": 154.125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.15302323177456856,
      "epoch": 0.18142658638258452,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015116202645003796,
      "kl": 0.008419914287514985,
      "learning_rate": 9.63723946271422e-07,
      "loss": 0.0004,
      "num_tokens": 107749499.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3917,
      "step_time": 18.053609509021044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 192.1875,
      "completions/mean_terminated_length": 192.1875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3846072405576706,
      "epoch": 0.18147290412227884,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010995019227266312,
      "kl": 0.00756355409976095,
      "learning_rate": 9.63714682723483e-07,
      "loss": 0.0004,
      "num_tokens": 107773678.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3918,
      "step_time": 20.119005125015974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 226.0,
      "completions/mean_terminated_length": 226.0,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.40393929183483124,
      "epoch": 0.18151922186197314,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09263960272073746,
      "kl": 0.0070364398416131735,
      "learning_rate": 9.637054191755442e-07,
      "loss": -0.0306,
      "num_tokens": 107806110.0,
      "reward": 0.7071548700332642,
      "reward_std": 0.42166566848754883,
      "rewards/reward_func/mean": 0.7071548700332642,
      "rewards/reward_func/std": 0.4216656982898712,
      "step": 3919,
      "step_time": 26.11157711967826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 150.375,
      "completions/mean_terminated_length": 150.375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3409520164132118,
      "epoch": 0.18156553960166744,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002416732022538781,
      "kl": 0.0018317789945285767,
      "learning_rate": 9.636961556276054e-07,
      "loss": 0.0001,
      "num_tokens": 107841892.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3920,
      "step_time": 18.842884566634893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 167.875,
      "completions/mean_terminated_length": 167.875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.18287443369627,
      "epoch": 0.18161185734136173,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005541726481169462,
      "kl": 0.013160837115719914,
      "learning_rate": 9.636868920796665e-07,
      "loss": 0.0007,
      "num_tokens": 107875090.0,
      "reward": 0.23457029461860657,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.23457029461860657,
      "rewards/reward_func/std": 0.0,
      "step": 3921,
      "step_time": 19.289226531982422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 179.9375,
      "completions/mean_terminated_length": 179.9375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.15644804388284683,
      "epoch": 0.18165817508105606,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09397299587726593,
      "kl": 0.0019102707447018474,
      "learning_rate": 9.636776285317276e-07,
      "loss": 0.0415,
      "num_tokens": 107903361.0,
      "reward": 0.8961285352706909,
      "reward_std": 0.019452031701803207,
      "rewards/reward_func/mean": 0.8961285352706909,
      "rewards/reward_func/std": 0.019452018663287163,
      "step": 3922,
      "step_time": 19.803660698235035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 318.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 242.6875,
      "completions/mean_terminated_length": 242.6875,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "entropy": 0.2391553744673729,
      "epoch": 0.18170449282075035,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13449817895889282,
      "kl": 0.00538853625766933,
      "learning_rate": 9.636683649837887e-07,
      "loss": -0.0743,
      "num_tokens": 107926412.0,
      "reward": 0.5545538663864136,
      "reward_std": 0.09470558166503906,
      "rewards/reward_func/mean": 0.5545538663864136,
      "rewards/reward_func/std": 0.09470557421445847,
      "step": 3923,
      "step_time": 26.411487139761448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 350.0,
      "completions/max_terminated_length": 350.0,
      "completions/mean_length": 201.1875,
      "completions/mean_terminated_length": 201.1875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.37411340326070786,
      "epoch": 0.18175081056044465,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10074204206466675,
      "kl": 0.004191602231003344,
      "learning_rate": 9.636591014358499e-07,
      "loss": 0.1088,
      "num_tokens": 107948127.0,
      "reward": 0.625,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5,
      "step": 3924,
      "step_time": 27.94429662078619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 149.625,
      "completions/mean_terminated_length": 149.625,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.4033663347363472,
      "epoch": 0.18179712830013894,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025540878996253014,
      "kl": 0.002043678832706064,
      "learning_rate": 9.63649837887911e-07,
      "loss": 0.0001,
      "num_tokens": 107978665.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3925,
      "step_time": 18.012788832187653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 175.5,
      "completions/mean_terminated_length": 175.5,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.41431621462106705,
      "epoch": 0.18184344603983327,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029174371156841516,
      "kl": 0.002668401808477938,
      "learning_rate": 9.636405743399721e-07,
      "loss": 0.0001,
      "num_tokens": 108012897.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3926,
      "step_time": 21.906016305088997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 145.3125,
      "completions/mean_terminated_length": 145.3125,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.24583029001951218,
      "epoch": 0.18188976377952756,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.017002010717988014,
      "kl": 0.008719731937162578,
      "learning_rate": 9.636313107920332e-07,
      "loss": 0.0004,
      "num_tokens": 108033270.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3927,
      "step_time": 15.884957481175661
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 128.625,
      "completions/mean_terminated_length": 128.625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.32335077971220016,
      "epoch": 0.18193608151922186,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007770972326397896,
      "kl": 0.004017679311800748,
      "learning_rate": 9.636220472440944e-07,
      "loss": 0.0002,
      "num_tokens": 108054960.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3928,
      "step_time": 14.316983543336391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 140.125,
      "completions/mean_terminated_length": 140.125,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.2963094189763069,
      "epoch": 0.18198239925891616,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003596608992666006,
      "kl": 0.002422891091555357,
      "learning_rate": 9.636127836961557e-07,
      "loss": 0.0001,
      "num_tokens": 108078562.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3929,
      "step_time": 16.01212800294161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 202.4375,
      "completions/mean_terminated_length": 202.4375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.34484798461198807,
      "epoch": 0.18202871699861048,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01932627521455288,
      "kl": 0.013576515251770616,
      "learning_rate": 9.636035201482168e-07,
      "loss": 0.0007,
      "num_tokens": 108107961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3930,
      "step_time": 21.849462278187275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 131.5,
      "completions/mean_terminated_length": 131.5,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.2778580114245415,
      "epoch": 0.18207503473830478,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022489915136247873,
      "kl": 0.001488033216446638,
      "learning_rate": 9.635942566002777e-07,
      "loss": 0.0001,
      "num_tokens": 108129617.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3931,
      "step_time": 14.768348969519138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 191.0625,
      "completions/mean_terminated_length": 191.0625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.17658758535981178,
      "epoch": 0.18212135247799907,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10897427052259445,
      "kl": 0.004015539278043434,
      "learning_rate": 9.63584993052339e-07,
      "loss": -0.0634,
      "num_tokens": 108157170.0,
      "reward": 0.8715384006500244,
      "reward_std": 0.22949376702308655,
      "rewards/reward_func/mean": 0.8715384006500244,
      "rewards/reward_func/std": 0.22949376702308655,
      "step": 3932,
      "step_time": 22.75570983439684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 194.0625,
      "completions/mean_terminated_length": 194.0625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.43365979939699173,
      "epoch": 0.18216767021769337,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004585118032991886,
      "kl": 0.0038146452279761434,
      "learning_rate": 9.635757295044002e-07,
      "loss": 0.0002,
      "num_tokens": 108190739.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3933,
      "step_time": 25.706376645714045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 147.875,
      "completions/mean_terminated_length": 147.875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.23387249931693077,
      "epoch": 0.1822139879573877,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00591407623142004,
      "kl": 0.005576587747782469,
      "learning_rate": 9.635664659564613e-07,
      "loss": 0.0003,
      "num_tokens": 108210977.0,
      "reward": 0.8070557117462158,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8070557117462158,
      "rewards/reward_func/std": 0.0,
      "step": 3934,
      "step_time": 15.148271512240171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 184.25,
      "completions/mean_terminated_length": 184.25,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.4190472513437271,
      "epoch": 0.182260305697082,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007171202916651964,
      "kl": 0.003614607499912381,
      "learning_rate": 9.635572024085225e-07,
      "loss": 0.0002,
      "num_tokens": 108246469.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3935,
      "step_time": 22.09712702408433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 314.0,
      "completions/max_terminated_length": 314.0,
      "completions/mean_length": 202.0,
      "completions/mean_terminated_length": 202.0,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.38352401554584503,
      "epoch": 0.18230662343677628,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12347492575645447,
      "kl": 0.005324090830981731,
      "learning_rate": 9.635479388605836e-07,
      "loss": 0.0739,
      "num_tokens": 108274357.0,
      "reward": 0.4375,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.4375,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 3936,
      "step_time": 26.792213916778564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 144.125,
      "completions/mean_terminated_length": 144.125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.34267380833625793,
      "epoch": 0.18235294117647058,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002206089673563838,
      "kl": 0.0022476864396594465,
      "learning_rate": 9.635386753126447e-07,
      "loss": 0.0001,
      "num_tokens": 108326567.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3937,
      "step_time": 23.410325340926647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 287.0,
      "completions/max_terminated_length": 287.0,
      "completions/mean_length": 193.875,
      "completions/mean_terminated_length": 193.875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.23739049211144447,
      "epoch": 0.1823992589161649,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09990550577640533,
      "kl": 0.0032598910038359463,
      "learning_rate": 9.635294117647058e-07,
      "loss": 0.1541,
      "num_tokens": 108347877.0,
      "reward": 0.7684125304222107,
      "reward_std": 0.23634617030620575,
      "rewards/reward_func/mean": 0.7684125304222107,
      "rewards/reward_func/std": 0.23634617030620575,
      "step": 3938,
      "step_time": 23.93851326778531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 336.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 244.375,
      "completions/mean_terminated_length": 244.375,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.49285295605659485,
      "epoch": 0.1824455766558592,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0933506041765213,
      "kl": 0.006029486772604287,
      "learning_rate": 9.63520148216767e-07,
      "loss": 0.02,
      "num_tokens": 108371755.0,
      "reward": 0.3125,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.3125,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 3939,
      "step_time": 27.82623726502061
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 132.625,
      "completions/mean_terminated_length": 132.625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.28407369554042816,
      "epoch": 0.1824918943955535,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006024600937962532,
      "kl": 0.0027144767227582633,
      "learning_rate": 9.63510884668828e-07,
      "loss": 0.0001,
      "num_tokens": 108393109.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3940,
      "step_time": 16.098177798092365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 133.75,
      "completions/mean_terminated_length": 133.75,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2104021981358528,
      "epoch": 0.1825382121352478,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003316520946100354,
      "kl": 0.0015981484903022647,
      "learning_rate": 9.635016211208892e-07,
      "loss": 0.0001,
      "num_tokens": 108412769.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3941,
      "step_time": 13.816600162535906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 305.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 201.9375,
      "completions/mean_terminated_length": 201.9375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.41445229202508926,
      "epoch": 0.18258452987494211,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11863738298416138,
      "kl": 0.007025412167422473,
      "learning_rate": 9.634923575729505e-07,
      "loss": 0.1255,
      "num_tokens": 108444912.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 3942,
      "step_time": 27.154757909476757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 128.0,
      "completions/max_terminated_length": 128.0,
      "completions/mean_length": 114.3125,
      "completions/mean_terminated_length": 114.3125,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.2739161103963852,
      "epoch": 0.1826308476146364,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024053254164755344,
      "kl": 0.0017553191573824733,
      "learning_rate": 9.634830940250114e-07,
      "loss": 0.0001,
      "num_tokens": 108465525.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3943,
      "step_time": 12.522561896592379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 263.0,
      "completions/max_terminated_length": 263.0,
      "completions/mean_length": 203.6875,
      "completions/mean_terminated_length": 203.6875,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3565376326441765,
      "epoch": 0.1826771653543307,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012317704036831856,
      "kl": 0.007949843886308372,
      "learning_rate": 9.634738304770726e-07,
      "loss": 0.0004,
      "num_tokens": 108492976.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3944,
      "step_time": 23.16466485336423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 226.25,
      "completions/mean_terminated_length": 226.25,
      "completions/min_length": 203.0,
      "completions/min_terminated_length": 203.0,
      "entropy": 0.22141293436288834,
      "epoch": 0.182723483094025,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00474140141159296,
      "kl": 0.004121715668588877,
      "learning_rate": 9.634645669291337e-07,
      "loss": 0.0002,
      "num_tokens": 108515732.0,
      "reward": 0.7206611633300781,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7206611633300781,
      "rewards/reward_func/std": 0.0,
      "step": 3945,
      "step_time": 21.747199185192585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 187.6875,
      "completions/mean_terminated_length": 187.6875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.1961042806506157,
      "epoch": 0.18276980083371933,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009400580078363419,
      "kl": 0.005008580919820815,
      "learning_rate": 9.63455303381195e-07,
      "loss": 0.0002,
      "num_tokens": 108540591.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 3946,
      "step_time": 19.772193666547537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 182.375,
      "completions/mean_terminated_length": 182.375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.3062416911125183,
      "epoch": 0.18281611857341362,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12036958336830139,
      "kl": 0.00928862695582211,
      "learning_rate": 9.634460398332562e-07,
      "loss": 0.0938,
      "num_tokens": 108564645.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 3947,
      "step_time": 21.514170866459608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 138.5625,
      "completions/mean_terminated_length": 138.5625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3367481976747513,
      "epoch": 0.18286243631310792,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019740720745176077,
      "kl": 0.0016995065961964428,
      "learning_rate": 9.634367762853173e-07,
      "loss": 0.0001,
      "num_tokens": 108598174.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3948,
      "step_time": 18.64675521478057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 176.3125,
      "completions/mean_terminated_length": 176.3125,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.21268099546432495,
      "epoch": 0.1829087540528022,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09837204217910767,
      "kl": 0.0037456040736287832,
      "learning_rate": 9.634275127373784e-07,
      "loss": 0.0125,
      "num_tokens": 108626387.0,
      "reward": 0.990892767906189,
      "reward_std": 0.03642905503511429,
      "rewards/reward_func/mean": 0.990892767906189,
      "rewards/reward_func/std": 0.036429062485694885,
      "step": 3949,
      "step_time": 19.300760619342327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 169.4375,
      "completions/mean_terminated_length": 169.4375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.3538523018360138,
      "epoch": 0.18295507179249654,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003690734039992094,
      "kl": 0.0029123123385943472,
      "learning_rate": 9.634182491894395e-07,
      "loss": 0.0001,
      "num_tokens": 108661610.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3950,
      "step_time": 20.566821537911892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 193.125,
      "completions/mean_terminated_length": 193.125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.35938917845487595,
      "epoch": 0.18300138953219083,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10962846130132675,
      "kl": 0.006495502311736345,
      "learning_rate": 9.634089856415007e-07,
      "loss": -0.0847,
      "num_tokens": 108699452.0,
      "reward": 0.25,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 3951,
      "step_time": 25.562778376042843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 173.0,
      "completions/mean_terminated_length": 173.0,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.3935577720403671,
      "epoch": 0.18304770727188513,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006419269368052483,
      "kl": 0.004049824550747871,
      "learning_rate": 9.633997220935618e-07,
      "loss": 0.0002,
      "num_tokens": 108729420.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3952,
      "step_time": 19.729410778731108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 124.5625,
      "completions/mean_terminated_length": 124.5625,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.1442272663116455,
      "epoch": 0.18309402501157943,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00809891615062952,
      "kl": 0.0035606708261184394,
      "learning_rate": 9.63390458545623e-07,
      "loss": 0.0002,
      "num_tokens": 108749749.0,
      "reward": 0.894839346408844,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.894839346408844,
      "rewards/reward_func/std": 0.0,
      "step": 3953,
      "step_time": 14.20975099503994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 194.4375,
      "completions/mean_terminated_length": 194.4375,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.34076155722141266,
      "epoch": 0.18314034275127375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029215000104159117,
      "kl": 0.0025104266533162445,
      "learning_rate": 9.63381194997684e-07,
      "loss": 0.0001,
      "num_tokens": 108777292.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3954,
      "step_time": 20.822266440838575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 196.8125,
      "completions/mean_terminated_length": 196.8125,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.37988148629665375,
      "epoch": 0.18318666049096805,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10158326476812363,
      "kl": 0.011375895468518138,
      "learning_rate": 9.633719314497452e-07,
      "loss": 0.0732,
      "num_tokens": 108801769.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 3955,
      "step_time": 22.12293777242303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 174.25,
      "completions/mean_terminated_length": 174.25,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.1992478109896183,
      "epoch": 0.18323297823066234,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00272434507496655,
      "kl": 0.002358984900638461,
      "learning_rate": 9.633626679018063e-07,
      "loss": 0.0001,
      "num_tokens": 108831309.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 3956,
      "step_time": 20.238527458161116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 148.4375,
      "completions/mean_terminated_length": 148.4375,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.34961116313934326,
      "epoch": 0.18327929597035664,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012762676924467087,
      "kl": 0.004101846134290099,
      "learning_rate": 9.633534043538674e-07,
      "loss": 0.0002,
      "num_tokens": 108855348.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3957,
      "step_time": 16.884622506797314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 161.625,
      "completions/mean_terminated_length": 161.625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.2958749383687973,
      "epoch": 0.18332561371005096,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004096558783203363,
      "kl": 0.002952422248199582,
      "learning_rate": 9.633441408059285e-07,
      "loss": 0.0001,
      "num_tokens": 108877694.0,
      "reward": 0.7958667874336243,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7958667874336243,
      "rewards/reward_func/std": 0.0,
      "step": 3958,
      "step_time": 17.800376694649458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 343.0,
      "completions/max_terminated_length": 343.0,
      "completions/mean_length": 323.25,
      "completions/mean_terminated_length": 323.25,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "entropy": 0.11619868874549866,
      "epoch": 0.18337193144974526,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000940678408369422,
      "kl": 0.0008944468863774091,
      "learning_rate": 9.633348772579899e-07,
      "loss": 0.0,
      "num_tokens": 108907170.0,
      "reward": 0.6894580721855164,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6894580721855164,
      "rewards/reward_func/std": 0.0,
      "step": 3959,
      "step_time": 30.210374638438225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 129.5625,
      "completions/mean_terminated_length": 129.5625,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2954455837607384,
      "epoch": 0.18341824918943955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016661534318700433,
      "kl": 0.001608467398909852,
      "learning_rate": 9.63325613710051e-07,
      "loss": 0.0001,
      "num_tokens": 108928395.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3960,
      "step_time": 14.256221912801266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 130.0625,
      "completions/mean_terminated_length": 130.0625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2928720563650131,
      "epoch": 0.18346456692913385,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002274853875860572,
      "kl": 0.0017797103500925004,
      "learning_rate": 9.63316350162112e-07,
      "loss": 0.0001,
      "num_tokens": 108951532.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3961,
      "step_time": 15.395901620388031
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 137.3125,
      "completions/mean_terminated_length": 137.3125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.29976020008325577,
      "epoch": 0.18351088466882817,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009205182082951069,
      "kl": 0.003424542141146958,
      "learning_rate": 9.633070866141732e-07,
      "loss": 0.0002,
      "num_tokens": 108971841.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3962,
      "step_time": 15.533253353089094
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 291.0,
      "completions/max_terminated_length": 291.0,
      "completions/mean_length": 215.625,
      "completions/mean_terminated_length": 215.625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.41104743629693985,
      "epoch": 0.18355720240852247,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11541653424501419,
      "kl": 0.012414152850396931,
      "learning_rate": 9.632978230662344e-07,
      "loss": 0.0011,
      "num_tokens": 108993067.0,
      "reward": 0.01748797297477722,
      "reward_std": 0.0179652851074934,
      "rewards/reward_func/mean": 0.01748797297477722,
      "rewards/reward_func/std": 0.0179652851074934,
      "step": 3963,
      "step_time": 23.979558132588863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 138.875,
      "completions/mean_terminated_length": 138.875,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.33035293966531754,
      "epoch": 0.18360352014821676,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003688642056658864,
      "kl": 0.0030114661203697324,
      "learning_rate": 9.632885595182955e-07,
      "loss": 0.0002,
      "num_tokens": 109013193.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3964,
      "step_time": 15.232430435717106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 179.4375,
      "completions/mean_terminated_length": 179.4375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.2221505045890808,
      "epoch": 0.18364983788791106,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003660363843664527,
      "kl": 0.00352401816053316,
      "learning_rate": 9.632792959703566e-07,
      "loss": 0.0002,
      "num_tokens": 109034832.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3965,
      "step_time": 20.020007949322462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 200.875,
      "completions/mean_terminated_length": 200.875,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.4312363564968109,
      "epoch": 0.18369615562760538,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005175455939024687,
      "kl": 0.004027256276458502,
      "learning_rate": 9.632700324224177e-07,
      "loss": 0.0002,
      "num_tokens": 109066414.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3966,
      "step_time": 22.746452674269676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 167.8125,
      "completions/mean_terminated_length": 167.8125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.2729054242372513,
      "epoch": 0.18374247336729968,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009631850756704807,
      "kl": 0.00643218751065433,
      "learning_rate": 9.632607688744789e-07,
      "loss": 0.0003,
      "num_tokens": 109092651.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3967,
      "step_time": 17.775084663182497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 123.5625,
      "completions/mean_terminated_length": 123.5625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2562769018113613,
      "epoch": 0.18378879110699398,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029653096571564674,
      "kl": 0.0018083448521792889,
      "learning_rate": 9.6325150532654e-07,
      "loss": 0.0001,
      "num_tokens": 109114180.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3968,
      "step_time": 14.097911071032286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 201.375,
      "completions/mean_terminated_length": 201.375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.2452106587588787,
      "epoch": 0.18383510884668827,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1170409545302391,
      "kl": 0.02185472333803773,
      "learning_rate": 9.632422417786011e-07,
      "loss": -0.0253,
      "num_tokens": 109145978.0,
      "reward": 0.9886171817779541,
      "reward_std": 0.017437174916267395,
      "rewards/reward_func/mean": 0.9886171817779541,
      "rewards/reward_func/std": 0.017437167465686798,
      "step": 3969,
      "step_time": 22.349928356707096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 157.6875,
      "completions/mean_terminated_length": 157.6875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.1739608719944954,
      "epoch": 0.1838814265863826,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022780471481382847,
      "kl": 0.0016107640403788537,
      "learning_rate": 9.632329782306622e-07,
      "loss": 0.0001,
      "num_tokens": 109170277.0,
      "reward": 0.9394130706787109,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9394130706787109,
      "rewards/reward_func/std": 0.0,
      "step": 3970,
      "step_time": 17.16651639714837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 139.0,
      "completions/mean_terminated_length": 139.0,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.30899014323949814,
      "epoch": 0.1839277443260769,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005675748456269503,
      "kl": 0.002892179589252919,
      "learning_rate": 9.632237146827234e-07,
      "loss": 0.0001,
      "num_tokens": 109192277.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3971,
      "step_time": 15.727838676422834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 162.1875,
      "completions/mean_terminated_length": 162.1875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3050469681620598,
      "epoch": 0.1839740620657712,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003201049519702792,
      "kl": 0.0022730419295839965,
      "learning_rate": 9.632144511347847e-07,
      "loss": 0.0001,
      "num_tokens": 109214296.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3972,
      "step_time": 16.43618332967162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 207.5625,
      "completions/mean_terminated_length": 207.5625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.39574429392814636,
      "epoch": 0.18402037980546548,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1119006797671318,
      "kl": 0.009786227601580322,
      "learning_rate": 9.632051875868458e-07,
      "loss": 0.025,
      "num_tokens": 109237921.0,
      "reward": 0.625,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5,
      "step": 3973,
      "step_time": 23.224096555262804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 187.5,
      "completions/mean_terminated_length": 187.5,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.21707920357584953,
      "epoch": 0.1840666975451598,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1409747302532196,
      "kl": 0.028514136094599962,
      "learning_rate": 9.631959240389067e-07,
      "loss": -0.0607,
      "num_tokens": 109275449.0,
      "reward": 0.5498151779174805,
      "reward_std": 0.24936477839946747,
      "rewards/reward_func/mean": 0.5498151779174805,
      "rewards/reward_func/std": 0.24936480820178986,
      "step": 3974,
      "step_time": 23.64865927770734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 272.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 197.25,
      "completions/mean_terminated_length": 197.25,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.308023177087307,
      "epoch": 0.1841130152848541,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008185689337551594,
      "kl": 0.008744140737690032,
      "learning_rate": 9.631866604909679e-07,
      "loss": 0.0004,
      "num_tokens": 109300765.0,
      "reward": 0.3992621898651123,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3992621898651123,
      "rewards/reward_func/std": 0.0,
      "step": 3975,
      "step_time": 24.08602061495185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 168.25,
      "completions/mean_terminated_length": 168.25,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.18104015290737152,
      "epoch": 0.1841593330245484,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1370517462491989,
      "kl": 0.0021947959903627634,
      "learning_rate": 9.631773969430292e-07,
      "loss": -0.0095,
      "num_tokens": 109336433.0,
      "reward": 0.8720898628234863,
      "reward_std": 0.0024667978286743164,
      "rewards/reward_func/mean": 0.8720898628234863,
      "rewards/reward_func/std": 0.0024667978286743164,
      "step": 3976,
      "step_time": 20.16820090636611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 204.75,
      "completions/mean_terminated_length": 204.75,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.2309229113161564,
      "epoch": 0.1842056507642427,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11665613204240799,
      "kl": 0.007314708083868027,
      "learning_rate": 9.631681333950903e-07,
      "loss": 0.0111,
      "num_tokens": 109365101.0,
      "reward": 0.7043023109436035,
      "reward_std": 0.22019338607788086,
      "rewards/reward_func/mean": 0.7043023109436035,
      "rewards/reward_func/std": 0.22019338607788086,
      "step": 3977,
      "step_time": 22.09240308776498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 166.25,
      "completions/mean_terminated_length": 166.25,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.40785839408636093,
      "epoch": 0.18425196850393702,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003447173163294792,
      "kl": 0.0027228702674619853,
      "learning_rate": 9.631588698471515e-07,
      "loss": 0.0001,
      "num_tokens": 109404561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3978,
      "step_time": 21.196227714419365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 136.5,
      "completions/mean_terminated_length": 136.5,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.23304549604654312,
      "epoch": 0.18429828624363132,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0072014289908111095,
      "kl": 0.00271628238260746,
      "learning_rate": 9.631496062992126e-07,
      "loss": 0.0001,
      "num_tokens": 109424985.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3979,
      "step_time": 16.092280738055706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 132.6875,
      "completions/mean_terminated_length": 132.6875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.2900601103901863,
      "epoch": 0.1843446039833256,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001795622636564076,
      "kl": 0.0015797052474226803,
      "learning_rate": 9.631403427512737e-07,
      "loss": 0.0001,
      "num_tokens": 109448340.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3980,
      "step_time": 15.006452061235905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 147.5625,
      "completions/mean_terminated_length": 147.5625,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.40439480543136597,
      "epoch": 0.1843909217230199,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036746393889188766,
      "kl": 0.0026609241031110287,
      "learning_rate": 9.631310792033348e-07,
      "loss": 0.0001,
      "num_tokens": 109471469.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3981,
      "step_time": 16.04165256768465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 193.875,
      "completions/mean_terminated_length": 193.875,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.16588760539889336,
      "epoch": 0.18443723946271423,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002614664612337947,
      "kl": 0.0015347810985986143,
      "learning_rate": 9.63121815655396e-07,
      "loss": 0.0001,
      "num_tokens": 109517227.0,
      "reward": 0.9181891679763794,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9181891679763794,
      "rewards/reward_func/std": 0.0,
      "step": 3982,
      "step_time": 24.127741757780313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 190.9375,
      "completions/mean_terminated_length": 190.9375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.20520596951246262,
      "epoch": 0.18448355720240853,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09583383798599243,
      "kl": 0.0028633428155444562,
      "learning_rate": 9.63112552107457e-07,
      "loss": 0.0318,
      "num_tokens": 109540026.0,
      "reward": 0.9898674488067627,
      "reward_std": 0.02178443968296051,
      "rewards/reward_func/mean": 0.9898674488067627,
      "rewards/reward_func/std": 0.02178444340825081,
      "step": 3983,
      "step_time": 19.54361553490162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 308.0,
      "completions/max_terminated_length": 308.0,
      "completions/mean_length": 273.3125,
      "completions/mean_terminated_length": 273.3125,
      "completions/min_length": 241.0,
      "completions/min_terminated_length": 241.0,
      "entropy": 0.17532089725136757,
      "epoch": 0.18452987494210282,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11145468056201935,
      "kl": 0.02424334827810526,
      "learning_rate": 9.631032885595182e-07,
      "loss": -0.0232,
      "num_tokens": 109565951.0,
      "reward": 0.9905068874359131,
      "reward_std": 0.012657547369599342,
      "rewards/reward_func/mean": 0.9905068874359131,
      "rewards/reward_func/std": 0.012657553888857365,
      "step": 3984,
      "step_time": 26.740829281508923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 192.25,
      "completions/mean_terminated_length": 192.25,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.4750644564628601,
      "epoch": 0.18457619268179712,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004511016421020031,
      "kl": 0.0034640480880625546,
      "learning_rate": 9.630940250115795e-07,
      "loss": 0.0002,
      "num_tokens": 109591043.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3985,
      "step_time": 21.56521673128009
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 122.625,
      "completions/mean_terminated_length": 122.625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.28628551959991455,
      "epoch": 0.18462251042149144,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01212072093039751,
      "kl": 0.0029937040817458183,
      "learning_rate": 9.630847614636405e-07,
      "loss": 0.0002,
      "num_tokens": 109611773.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3986,
      "step_time": 12.988273493945599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 127.5,
      "completions/mean_terminated_length": 127.5,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.1995968595147133,
      "epoch": 0.18466882816118574,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005069428123533726,
      "kl": 0.002444858255330473,
      "learning_rate": 9.630754979157016e-07,
      "loss": 0.0001,
      "num_tokens": 109631237.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3987,
      "step_time": 13.890676397830248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 151.0625,
      "completions/mean_terminated_length": 151.0625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.33534102141857147,
      "epoch": 0.18471514590088003,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027537329588085413,
      "kl": 0.002736641326919198,
      "learning_rate": 9.630662343677627e-07,
      "loss": 0.0001,
      "num_tokens": 109656006.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3988,
      "step_time": 17.098580598831177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 134.125,
      "completions/mean_terminated_length": 134.125,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.3812548294663429,
      "epoch": 0.18476146364057433,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005437885876744986,
      "kl": 0.0031718131504021585,
      "learning_rate": 9.63056970819824e-07,
      "loss": 0.0002,
      "num_tokens": 109679448.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3989,
      "step_time": 15.155997782945633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 134.25,
      "completions/mean_terminated_length": 134.25,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.18073805794119835,
      "epoch": 0.18480778138026865,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018739913357421756,
      "kl": 0.001899892173241824,
      "learning_rate": 9.630477072718852e-07,
      "loss": 0.0001,
      "num_tokens": 109702124.0,
      "reward": 0.6431870460510254,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6431870460510254,
      "rewards/reward_func/std": 0.0,
      "step": 3990,
      "step_time": 15.018368661403656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 182.8125,
      "completions/mean_terminated_length": 182.8125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.31624697893857956,
      "epoch": 0.18485409911996295,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12621982395648956,
      "kl": 0.010969286668114364,
      "learning_rate": 9.630384437239463e-07,
      "loss": -0.0211,
      "num_tokens": 109723609.0,
      "reward": 0.30878984928131104,
      "reward_std": 0.4730664789676666,
      "rewards/reward_func/mean": 0.30878984928131104,
      "rewards/reward_func/std": 0.4730664789676666,
      "step": 3991,
      "step_time": 19.846620678901672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 149.375,
      "completions/mean_terminated_length": 149.375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.36934104561805725,
      "epoch": 0.18490041685965725,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005815388169139624,
      "kl": 0.0047956041526049376,
      "learning_rate": 9.630291801760074e-07,
      "loss": 0.0002,
      "num_tokens": 109759727.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3992,
      "step_time": 19.237177781760693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 157.0625,
      "completions/mean_terminated_length": 157.0625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.39627500623464584,
      "epoch": 0.18494673459935154,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027054885867983103,
      "kl": 0.0024817449739202857,
      "learning_rate": 9.630199166280685e-07,
      "loss": 0.0001,
      "num_tokens": 109804080.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3993,
      "step_time": 25.613412898033857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 138.5,
      "completions/mean_terminated_length": 138.5,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.25153781473636627,
      "epoch": 0.18499305233904587,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006807493511587381,
      "kl": 0.0034551440621726215,
      "learning_rate": 9.630106530801297e-07,
      "loss": 0.0002,
      "num_tokens": 109824216.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3994,
      "step_time": 14.718906585127115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 164.75,
      "completions/mean_terminated_length": 164.75,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.1745079718530178,
      "epoch": 0.18503937007874016,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036642674822360277,
      "kl": 0.002100349054671824,
      "learning_rate": 9.630013895321908e-07,
      "loss": 0.0001,
      "num_tokens": 109850052.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 3995,
      "step_time": 19.60317961871624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 167.625,
      "completions/mean_terminated_length": 167.625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.43344592303037643,
      "epoch": 0.18508568781843446,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022423649206757545,
      "kl": 0.0022812695242464542,
      "learning_rate": 9.62992125984252e-07,
      "loss": 0.0001,
      "num_tokens": 109901406.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 3996,
      "step_time": 25.30669067800045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 208.0625,
      "completions/mean_terminated_length": 208.0625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.20702053233981133,
      "epoch": 0.18513200555812875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035511634778231382,
      "kl": 0.010278586996719241,
      "learning_rate": 9.62982862436313e-07,
      "loss": 0.0005,
      "num_tokens": 109924015.0,
      "reward": 0.6726685166358948,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6726685166358948,
      "rewards/reward_func/std": 0.0,
      "step": 3997,
      "step_time": 21.41838786378503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 183.6875,
      "completions/mean_terminated_length": 183.6875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.2692618891596794,
      "epoch": 0.18517832329782308,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005457677878439426,
      "kl": 0.004745265352539718,
      "learning_rate": 9.629735988883742e-07,
      "loss": 0.0002,
      "num_tokens": 109948218.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3998,
      "step_time": 19.842437531799078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 418.0,
      "completions/max_terminated_length": 418.0,
      "completions/mean_length": 274.625,
      "completions/mean_terminated_length": 274.625,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "entropy": 0.22472453117370605,
      "epoch": 0.18522464103751737,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007122904062271118,
      "kl": 0.0067714008037000895,
      "learning_rate": 9.629643353404353e-07,
      "loss": 0.0003,
      "num_tokens": 109983012.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 3999,
      "step_time": 35.84325436875224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 317.0,
      "completions/max_terminated_length": 317.0,
      "completions/mean_length": 278.5,
      "completions/mean_terminated_length": 278.5,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "entropy": 0.29957157373428345,
      "epoch": 0.18527095877721167,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08676525950431824,
      "kl": 0.007300138706341386,
      "learning_rate": 9.629550717924964e-07,
      "loss": -0.0123,
      "num_tokens": 110013420.0,
      "reward": 0.7593790292739868,
      "reward_std": 0.38766154646873474,
      "rewards/reward_func/mean": 0.7593790292739868,
      "rewards/reward_func/std": 0.38766157627105713,
      "step": 4000,
      "step_time": 28.48399420455098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 130.8125,
      "completions/mean_terminated_length": 130.8125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.24999059736728668,
      "epoch": 0.18531727651690597,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00222027231939137,
      "kl": 0.0014751394337508827,
      "learning_rate": 9.629458082445575e-07,
      "loss": 0.0001,
      "num_tokens": 110033033.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4001,
      "step_time": 13.585205253213644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 166.875,
      "completions/mean_terminated_length": 166.875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.3456791341304779,
      "epoch": 0.1853635942566003,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006335800979286432,
      "kl": 0.00560496945399791,
      "learning_rate": 9.629365446966189e-07,
      "loss": 0.0003,
      "num_tokens": 110056103.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4002,
      "step_time": 19.210248716175556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 129.125,
      "completions/mean_terminated_length": 129.125,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.21131492778658867,
      "epoch": 0.18540991199629459,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027449503540992737,
      "kl": 0.001373827486531809,
      "learning_rate": 9.6292728114868e-07,
      "loss": 0.0001,
      "num_tokens": 110077577.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4003,
      "step_time": 14.045453313738108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 130.0,
      "completions/max_terminated_length": 130.0,
      "completions/mean_length": 109.375,
      "completions/mean_terminated_length": 109.375,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.27837956696748734,
      "epoch": 0.18545622973598888,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006750398315489292,
      "kl": 0.004190101637504995,
      "learning_rate": 9.629180176007411e-07,
      "loss": 0.0002,
      "num_tokens": 110096799.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4004,
      "step_time": 13.59730527177453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 217.1875,
      "completions/mean_terminated_length": 217.1875,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.16105342656373978,
      "epoch": 0.18550254747568318,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005388450343161821,
      "kl": 0.004298588493838906,
      "learning_rate": 9.62908754052802e-07,
      "loss": 0.0002,
      "num_tokens": 110120658.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4005,
      "step_time": 20.983232606202364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 122.625,
      "completions/mean_terminated_length": 122.625,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.31485600769519806,
      "epoch": 0.1855488652153775,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034729966428130865,
      "kl": 0.002158529096050188,
      "learning_rate": 9.628994905048634e-07,
      "loss": 0.0001,
      "num_tokens": 110142540.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4006,
      "step_time": 13.971363630145788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 168.1875,
      "completions/mean_terminated_length": 168.1875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.37800244241952896,
      "epoch": 0.1855951829550718,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002700837329030037,
      "kl": 0.0019029233371838927,
      "learning_rate": 9.628902269569245e-07,
      "loss": 0.0001,
      "num_tokens": 110177631.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4007,
      "step_time": 21.014577466994524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 190.5,
      "completions/mean_terminated_length": 190.5,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.1682998165488243,
      "epoch": 0.1856415006947661,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00239096162840724,
      "kl": 0.002248078992124647,
      "learning_rate": 9.628809634089856e-07,
      "loss": 0.0001,
      "num_tokens": 110205863.0,
      "reward": 0.9364250898361206,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9364250898361206,
      "rewards/reward_func/std": 0.0,
      "step": 4008,
      "step_time": 20.772039148956537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 128.75,
      "completions/mean_terminated_length": 128.75,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2897027060389519,
      "epoch": 0.1856878184344604,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018700878135859966,
      "kl": 0.0014238965668482706,
      "learning_rate": 9.628716998610468e-07,
      "loss": 0.0001,
      "num_tokens": 110230131.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4009,
      "step_time": 14.397155273705721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 216.6875,
      "completions/mean_terminated_length": 216.6875,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.23186129331588745,
      "epoch": 0.1857341361741547,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01045618299394846,
      "kl": 0.026143222115933895,
      "learning_rate": 9.628624363131079e-07,
      "loss": 0.0013,
      "num_tokens": 110253662.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4010,
      "step_time": 20.775734931230545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 150.5,
      "completions/mean_terminated_length": 150.5,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.23873531445860863,
      "epoch": 0.185780453913849,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14374826848506927,
      "kl": 0.00922016124241054,
      "learning_rate": 9.62853172765169e-07,
      "loss": -0.0157,
      "num_tokens": 110276806.0,
      "reward": 0.11073227226734161,
      "reward_std": 0.2918255031108856,
      "rewards/reward_func/mean": 0.11073227226734161,
      "rewards/reward_func/std": 0.2918255031108856,
      "step": 4011,
      "step_time": 16.624506752938032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 128.625,
      "completions/mean_terminated_length": 128.625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.326849564909935,
      "epoch": 0.1858267716535433,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029834227170795202,
      "kl": 0.0025584730319678783,
      "learning_rate": 9.628439092172301e-07,
      "loss": 0.0001,
      "num_tokens": 110301152.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4012,
      "step_time": 15.393717229366302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 214.9375,
      "completions/mean_terminated_length": 214.9375,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.303561232984066,
      "epoch": 0.1858730893932376,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08768731355667114,
      "kl": 0.00878506456501782,
      "learning_rate": 9.628346456692913e-07,
      "loss": 0.0508,
      "num_tokens": 110339519.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 4013,
      "step_time": 25.936167631298304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 357.0,
      "completions/max_terminated_length": 357.0,
      "completions/mean_length": 287.75,
      "completions/mean_terminated_length": 287.75,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "entropy": 0.44467893242836,
      "epoch": 0.18591940713293192,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09070470184087753,
      "kl": 0.00987121183425188,
      "learning_rate": 9.628253821213524e-07,
      "loss": -0.0279,
      "num_tokens": 110362539.0,
      "reward": 0.758585512638092,
      "reward_std": 0.38068443536758423,
      "rewards/reward_func/mean": 0.758585512638092,
      "rewards/reward_func/std": 0.38068443536758423,
      "step": 4014,
      "step_time": 28.47284570708871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 127.5,
      "completions/mean_terminated_length": 127.5,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.13524066284298897,
      "epoch": 0.18596572487262622,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004189995117485523,
      "kl": 0.002879993262467906,
      "learning_rate": 9.628161185734135e-07,
      "loss": 0.0001,
      "num_tokens": 110387731.0,
      "reward": 0.3441537916660309,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3441537916660309,
      "rewards/reward_func/std": 0.0,
      "step": 4015,
      "step_time": 15.309558905661106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 169.25,
      "completions/mean_terminated_length": 169.25,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.23994186893105507,
      "epoch": 0.18601204261232052,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.19447307288646698,
      "kl": 0.03730075154453516,
      "learning_rate": 9.628068550254748e-07,
      "loss": 0.0019,
      "num_tokens": 110410375.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 4016,
      "step_time": 19.559921495616436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 166.9375,
      "completions/mean_terminated_length": 166.9375,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.25637052953243256,
      "epoch": 0.1860583603520148,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004210201092064381,
      "kl": 0.0032640844001434743,
      "learning_rate": 9.627975914775358e-07,
      "loss": 0.0002,
      "num_tokens": 110431078.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 4017,
      "step_time": 17.773968800902367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 348.0,
      "completions/max_terminated_length": 348.0,
      "completions/mean_length": 249.125,
      "completions/mean_terminated_length": 249.125,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.3417166396975517,
      "epoch": 0.18610467809170914,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08424817025661469,
      "kl": 0.014773907605558634,
      "learning_rate": 9.627883279295969e-07,
      "loss": -0.0777,
      "num_tokens": 110458632.0,
      "reward": 0.4639071226119995,
      "reward_std": 0.360614538192749,
      "rewards/reward_func/mean": 0.4639071226119995,
      "rewards/reward_func/std": 0.3606145679950714,
      "step": 4018,
      "step_time": 29.189542088657618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 183.9375,
      "completions/mean_terminated_length": 183.9375,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.3959822431206703,
      "epoch": 0.18615099583140343,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004899558611214161,
      "kl": 0.004724961938336492,
      "learning_rate": 9.627790643816582e-07,
      "loss": 0.0002,
      "num_tokens": 110480439.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4019,
      "step_time": 18.707082144916058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 168.4375,
      "completions/mean_terminated_length": 168.4375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.2049410603940487,
      "epoch": 0.18619731357109773,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008570501580834389,
      "kl": 0.008052483783103526,
      "learning_rate": 9.627698008337193e-07,
      "loss": 0.0004,
      "num_tokens": 110503214.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 4020,
      "step_time": 17.276382356882095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 235.9375,
      "completions/mean_terminated_length": 235.9375,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.24076197668910027,
      "epoch": 0.18624363131079202,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0847514346241951,
      "kl": 0.011526466347277164,
      "learning_rate": 9.627605372857805e-07,
      "loss": -0.0824,
      "num_tokens": 110541789.0,
      "reward": 0.6347875595092773,
      "reward_std": 0.3249886929988861,
      "rewards/reward_func/mean": 0.6347875595092773,
      "rewards/reward_func/std": 0.3249886929988861,
      "step": 4021,
      "step_time": 26.530835896730423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 310.0,
      "completions/max_terminated_length": 310.0,
      "completions/mean_length": 240.625,
      "completions/mean_terminated_length": 240.625,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "entropy": 0.25206584483385086,
      "epoch": 0.18628994905048635,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09857936203479767,
      "kl": 0.032442583702504635,
      "learning_rate": 9.627512737378416e-07,
      "loss": 0.0109,
      "num_tokens": 110580503.0,
      "reward": 0.9319667816162109,
      "reward_std": 0.2495037168264389,
      "rewards/reward_func/mean": 0.9319667816162109,
      "rewards/reward_func/std": 0.2495037168264389,
      "step": 4022,
      "step_time": 29.375940918922424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 184.3125,
      "completions/mean_terminated_length": 184.3125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.25172628462314606,
      "epoch": 0.18633626679018064,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008962012827396393,
      "kl": 0.006980260368436575,
      "learning_rate": 9.627420101899027e-07,
      "loss": 0.0003,
      "num_tokens": 110613516.0,
      "reward": 0.8742223381996155,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8742223381996155,
      "rewards/reward_func/std": 0.0,
      "step": 4023,
      "step_time": 23.098502170294523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 201.5625,
      "completions/mean_terminated_length": 201.5625,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.24033070728182793,
      "epoch": 0.18638258452987494,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002025859896093607,
      "kl": 0.0017823771049734205,
      "learning_rate": 9.627327466419638e-07,
      "loss": 0.0001,
      "num_tokens": 110668325.0,
      "reward": 0.5623413324356079,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5623413324356079,
      "rewards/reward_func/std": 0.0,
      "step": 4024,
      "step_time": 27.585756354033947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 222.5,
      "completions/mean_terminated_length": 222.5,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "entropy": 0.21306084096431732,
      "epoch": 0.18642890226956924,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013144618133082986,
      "kl": 0.001131007564254105,
      "learning_rate": 9.62723483094025e-07,
      "loss": 0.0001,
      "num_tokens": 110701565.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4025,
      "step_time": 24.947600785642862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 166.3125,
      "completions/mean_terminated_length": 166.3125,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.4107777997851372,
      "epoch": 0.18647522000926356,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022324291057884693,
      "kl": 0.0021487894118763506,
      "learning_rate": 9.62714219546086e-07,
      "loss": 0.0001,
      "num_tokens": 110733154.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4026,
      "step_time": 19.024696111679077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 188.875,
      "completions/mean_terminated_length": 188.875,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.17104016616940498,
      "epoch": 0.18652153774895786,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002095515839755535,
      "kl": 0.0012917412968818098,
      "learning_rate": 9.627049559981472e-07,
      "loss": 0.0001,
      "num_tokens": 110764256.0,
      "reward": 0.11362193524837494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11362193524837494,
      "rewards/reward_func/std": 0.0,
      "step": 4027,
      "step_time": 20.831158369779587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 141.375,
      "completions/mean_terminated_length": 141.375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.26488659530878067,
      "epoch": 0.18656785548865215,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005742703098803759,
      "kl": 0.003191460156813264,
      "learning_rate": 9.626956924502083e-07,
      "loss": 0.0002,
      "num_tokens": 110784102.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4028,
      "step_time": 16.903685934841633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 267.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 233.625,
      "completions/mean_terminated_length": 233.625,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.2235318385064602,
      "epoch": 0.18661417322834645,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19104117155075073,
      "kl": 0.004938063560985029,
      "learning_rate": 9.626864289022695e-07,
      "loss": -0.0354,
      "num_tokens": 110817136.0,
      "reward": 0.5958160161972046,
      "reward_std": 0.10778238624334335,
      "rewards/reward_func/mean": 0.5958160161972046,
      "rewards/reward_func/std": 0.10778239369392395,
      "step": 4029,
      "step_time": 24.90700952708721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 171.375,
      "completions/mean_terminated_length": 171.375,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.256470900028944,
      "epoch": 0.18666049096804077,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0041847629472613335,
      "kl": 0.004776358720846474,
      "learning_rate": 9.626771653543306e-07,
      "loss": 0.0002,
      "num_tokens": 110839766.0,
      "reward": 0.1910344958305359,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.1910344958305359,
      "rewards/reward_func/std": 0.0,
      "step": 4030,
      "step_time": 17.760415386408567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 174.625,
      "completions/mean_terminated_length": 174.625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.4725622311234474,
      "epoch": 0.18670680870773507,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038015469908714294,
      "kl": 0.0030454796506091952,
      "learning_rate": 9.626679018063917e-07,
      "loss": 0.0002,
      "num_tokens": 110861312.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4031,
      "step_time": 18.09712029993534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 294.0,
      "completions/max_terminated_length": 294.0,
      "completions/mean_length": 210.25,
      "completions/mean_terminated_length": 210.25,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.37680091708898544,
      "epoch": 0.18675312644742936,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0995647981762886,
      "kl": 0.008350224932655692,
      "learning_rate": 9.62658638258453e-07,
      "loss": 0.0737,
      "num_tokens": 110894404.0,
      "reward": 0.4639376401901245,
      "reward_std": 0.47846540808677673,
      "rewards/reward_func/mean": 0.4639376401901245,
      "rewards/reward_func/std": 0.4784654676914215,
      "step": 4032,
      "step_time": 26.580738559365273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 222.75,
      "completions/mean_terminated_length": 222.75,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.33753830194473267,
      "epoch": 0.18679944418712366,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10607600957155228,
      "kl": 0.013644204940646887,
      "learning_rate": 9.626493747105142e-07,
      "loss": -0.0514,
      "num_tokens": 110919616.0,
      "reward": 0.6257338523864746,
      "reward_std": 0.16443493962287903,
      "rewards/reward_func/mean": 0.6257338523864746,
      "rewards/reward_func/std": 0.16443493962287903,
      "step": 4033,
      "step_time": 23.17309584468603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 333.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 267.0,
      "completions/mean_terminated_length": 267.0,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "entropy": 0.33476605266332626,
      "epoch": 0.18684576192681798,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09008869528770447,
      "kl": 0.010737331118434668,
      "learning_rate": 9.626401111625753e-07,
      "loss": -0.0856,
      "num_tokens": 110947216.0,
      "reward": 0.36792483925819397,
      "reward_std": 0.2843818664550781,
      "rewards/reward_func/mean": 0.36792483925819397,
      "rewards/reward_func/std": 0.2843818962574005,
      "step": 4034,
      "step_time": 28.35039295628667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 240.75,
      "completions/mean_terminated_length": 240.75,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "entropy": 0.23919912800192833,
      "epoch": 0.18689207966651228,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12189947813749313,
      "kl": 0.030286923982203007,
      "learning_rate": 9.626308476146362e-07,
      "loss": 0.0012,
      "num_tokens": 110984316.0,
      "reward": 0.38853946328163147,
      "reward_std": 0.012802098877727985,
      "rewards/reward_func/mean": 0.38853946328163147,
      "rewards/reward_func/std": 0.012802098877727985,
      "step": 4035,
      "step_time": 27.309667088091373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 145.75,
      "completions/mean_terminated_length": 145.75,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3725769445300102,
      "epoch": 0.18693839740620657,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003245528554543853,
      "kl": 0.0025223796255886555,
      "learning_rate": 9.626215840666975e-07,
      "loss": 0.0001,
      "num_tokens": 111020408.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4036,
      "step_time": 18.651448875665665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 134.375,
      "completions/mean_terminated_length": 134.375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.3189077600836754,
      "epoch": 0.18698471514590087,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030886817257851362,
      "kl": 0.0024825698928907514,
      "learning_rate": 9.626123205187587e-07,
      "loss": 0.0001,
      "num_tokens": 111045342.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4037,
      "step_time": 16.30925925076008
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 161.625,
      "completions/mean_terminated_length": 161.625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.26514882594347,
      "epoch": 0.1870310328855952,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29529812932014465,
      "kl": 0.015383483376353979,
      "learning_rate": 9.626030569708198e-07,
      "loss": 0.1239,
      "num_tokens": 111066840.0,
      "reward": 0.3272396922111511,
      "reward_std": 0.26179173588752747,
      "rewards/reward_func/mean": 0.3272396922111511,
      "rewards/reward_func/std": 0.26179173588752747,
      "step": 4038,
      "step_time": 20.36912925541401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 361.0,
      "completions/max_terminated_length": 361.0,
      "completions/mean_length": 240.625,
      "completions/mean_terminated_length": 240.625,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.478082574903965,
      "epoch": 0.1870773506252895,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1142750084400177,
      "kl": 0.0057019483065232635,
      "learning_rate": 9.62593793422881e-07,
      "loss": 0.2009,
      "num_tokens": 111091666.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 4039,
      "step_time": 29.76534355804324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 152.625,
      "completions/mean_terminated_length": 152.625,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3431262820959091,
      "epoch": 0.1871236683649838,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00506778247654438,
      "kl": 0.003924763586837798,
      "learning_rate": 9.62584529874942e-07,
      "loss": 0.0002,
      "num_tokens": 111114412.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4040,
      "step_time": 16.78785802423954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 145.0,
      "completions/mean_terminated_length": 145.0,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.36675096303224564,
      "epoch": 0.18716998610467808,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029439309146255255,
      "kl": 0.0022743356821592897,
      "learning_rate": 9.625752663270032e-07,
      "loss": 0.0001,
      "num_tokens": 111144844.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4041,
      "step_time": 19.179258815944195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 176.3125,
      "completions/mean_terminated_length": 176.3125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.2295929342508316,
      "epoch": 0.1872163038443724,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1008884459733963,
      "kl": 0.00183072779327631,
      "learning_rate": 9.625660027790643e-07,
      "loss": -0.0112,
      "num_tokens": 111167441.0,
      "reward": 0.9814902544021606,
      "reward_std": 0.033111222088336945,
      "rewards/reward_func/mean": 0.9814902544021606,
      "rewards/reward_func/std": 0.03311121463775635,
      "step": 4042,
      "step_time": 20.266985408961773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 155.0625,
      "completions/mean_terminated_length": 155.0625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.3718869537115097,
      "epoch": 0.1872626215840667,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004450637381523848,
      "kl": 0.003298588388133794,
      "learning_rate": 9.625567392311254e-07,
      "loss": 0.0002,
      "num_tokens": 111203794.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4043,
      "step_time": 19.718469090759754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 137.625,
      "completions/mean_terminated_length": 137.625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.23140939697623253,
      "epoch": 0.187308939323761,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004221647512167692,
      "kl": 0.0023980710247997195,
      "learning_rate": 9.625474756831865e-07,
      "loss": 0.0001,
      "num_tokens": 111224588.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4044,
      "step_time": 15.231382239609957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 191.25,
      "completions/mean_terminated_length": 191.25,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.18196557834744453,
      "epoch": 0.1873552570634553,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002249514451250434,
      "kl": 0.0017824685492087156,
      "learning_rate": 9.625382121352477e-07,
      "loss": 0.0001,
      "num_tokens": 111247680.0,
      "reward": 0.8657099008560181,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8657099008560181,
      "rewards/reward_func/std": 0.0,
      "step": 4045,
      "step_time": 22.431513603776693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 196.5625,
      "completions/mean_terminated_length": 196.5625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.40521497279405594,
      "epoch": 0.18740157480314962,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12528221309185028,
      "kl": 0.011679814429953694,
      "learning_rate": 9.62528948587309e-07,
      "loss": -0.0223,
      "num_tokens": 111286713.0,
      "reward": 0.0009923388715833426,
      "reward_std": 0.00396935548633337,
      "rewards/reward_func/mean": 0.0009923388715833426,
      "rewards/reward_func/std": 0.00396935548633337,
      "step": 4046,
      "step_time": 23.81155974417925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 120.3125,
      "completions/mean_terminated_length": 120.3125,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.257652398198843,
      "epoch": 0.1874478925428439,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028297470416873693,
      "kl": 0.0019749358762055635,
      "learning_rate": 9.625196850393701e-07,
      "loss": 0.0001,
      "num_tokens": 111306862.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4047,
      "step_time": 12.967925600707531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 177.0625,
      "completions/mean_terminated_length": 177.0625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.2123894840478897,
      "epoch": 0.1874942102825382,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10444404184818268,
      "kl": 0.01974491006694734,
      "learning_rate": 9.62510421491431e-07,
      "loss": 0.0177,
      "num_tokens": 111350639.0,
      "reward": 0.965315580368042,
      "reward_std": 0.05313253030180931,
      "rewards/reward_func/mean": 0.965315580368042,
      "rewards/reward_func/std": 0.05313252657651901,
      "step": 4048,
      "step_time": 24.339804265648127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 663.0,
      "completions/max_terminated_length": 663.0,
      "completions/mean_length": 256.8125,
      "completions/mean_terminated_length": 256.8125,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.44845743477344513,
      "epoch": 0.1875405280222325,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10149309039115906,
      "kl": 0.005505937500856817,
      "learning_rate": 9.625011579434924e-07,
      "loss": 0.3396,
      "num_tokens": 111381260.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 4049,
      "step_time": 51.83715748041868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 189.125,
      "completions/mean_terminated_length": 189.125,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.41343196481466293,
      "epoch": 0.18758684576192683,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012865274213254452,
      "kl": 0.00894459243863821,
      "learning_rate": 9.624918943955535e-07,
      "loss": 0.0004,
      "num_tokens": 111411230.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4050,
      "step_time": 23.866840578615665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 185.4375,
      "completions/mean_terminated_length": 185.4375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.4249361529946327,
      "epoch": 0.18763316350162113,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13789379596710205,
      "kl": 0.005413910665083677,
      "learning_rate": 9.624826308476146e-07,
      "loss": -0.0281,
      "num_tokens": 111432597.0,
      "reward": 0.5,
      "reward_std": 0.5163977742195129,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5163977742195129,
      "step": 4051,
      "step_time": 19.753078617155552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 303.0,
      "completions/max_terminated_length": 303.0,
      "completions/mean_length": 226.1875,
      "completions/mean_terminated_length": 226.1875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.3317742869257927,
      "epoch": 0.18767948124131542,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09370024502277374,
      "kl": 0.018835734808817506,
      "learning_rate": 9.624733672996758e-07,
      "loss": -0.1381,
      "num_tokens": 111469528.0,
      "reward": 0.18570634722709656,
      "reward_std": 0.2210385501384735,
      "rewards/reward_func/mean": 0.18570634722709656,
      "rewards/reward_func/std": 0.2210385650396347,
      "step": 4052,
      "step_time": 29.399143770337105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 217.0,
      "completions/mean_terminated_length": 217.0,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.26461541652679443,
      "epoch": 0.18772579898100972,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07868076115846634,
      "kl": 0.010406596818938851,
      "learning_rate": 9.624641037517369e-07,
      "loss": 0.0148,
      "num_tokens": 111495064.0,
      "reward": 0.5141167640686035,
      "reward_std": 0.21188393235206604,
      "rewards/reward_func/mean": 0.5141167640686035,
      "rewards/reward_func/std": 0.21188391745090485,
      "step": 4053,
      "step_time": 21.481164783239365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 104.8125,
      "completions/mean_terminated_length": 104.8125,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "entropy": 0.32731233537197113,
      "epoch": 0.18777211672070404,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005049731582403183,
      "kl": 0.0022659313399344683,
      "learning_rate": 9.62454840203798e-07,
      "loss": 0.0001,
      "num_tokens": 111518053.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4054,
      "step_time": 13.79790012165904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 129.8125,
      "completions/mean_terminated_length": 129.8125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3062795028090477,
      "epoch": 0.18781843446039834,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026553329080343246,
      "kl": 0.0020594959205482155,
      "learning_rate": 9.624455766558591e-07,
      "loss": 0.0001,
      "num_tokens": 111543938.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4055,
      "step_time": 16.338610626757145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 158.5,
      "completions/mean_terminated_length": 158.5,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.32172737270593643,
      "epoch": 0.18786475220009263,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005706567317247391,
      "kl": 0.003369259589817375,
      "learning_rate": 9.624363131079203e-07,
      "loss": 0.0002,
      "num_tokens": 111564506.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4056,
      "step_time": 16.800219353288412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 275.0,
      "completions/max_terminated_length": 275.0,
      "completions/mean_length": 231.625,
      "completions/mean_terminated_length": 231.625,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.2548220008611679,
      "epoch": 0.18791106993978693,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09054199606180191,
      "kl": 0.016585303004831076,
      "learning_rate": 9.624270495599814e-07,
      "loss": -0.0459,
      "num_tokens": 111590836.0,
      "reward": 0.3605729341506958,
      "reward_std": 0.25982192158699036,
      "rewards/reward_func/mean": 0.3605729341506958,
      "rewards/reward_func/std": 0.25982192158699036,
      "step": 4057,
      "step_time": 25.715322334319353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 164.625,
      "completions/mean_terminated_length": 164.625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.17664026096463203,
      "epoch": 0.18795738767948125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0039227064698934555,
      "kl": 0.0031869065715000033,
      "learning_rate": 9.624177860120425e-07,
      "loss": 0.0002,
      "num_tokens": 111617678.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4058,
      "step_time": 18.212814670056105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 181.5,
      "completions/mean_terminated_length": 181.5,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.37588997185230255,
      "epoch": 0.18800370541917555,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11496306955814362,
      "kl": 0.006013693870045245,
      "learning_rate": 9.624085224641038e-07,
      "loss": 0.0163,
      "num_tokens": 111665462.0,
      "reward": 0.5,
      "reward_std": 0.5163977742195129,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5163977742195129,
      "step": 4059,
      "step_time": 25.726170733571053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 125.8125,
      "completions/mean_terminated_length": 125.8125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.31233925372362137,
      "epoch": 0.18805002315886984,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002167275408282876,
      "kl": 0.001902395742945373,
      "learning_rate": 9.623992589161648e-07,
      "loss": 0.0001,
      "num_tokens": 111685715.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4060,
      "step_time": 13.839465118944645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 189.8125,
      "completions/mean_terminated_length": 189.8125,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.22551383078098297,
      "epoch": 0.18809634089856414,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001395753468386829,
      "kl": 0.0014026453718543053,
      "learning_rate": 9.623899953682259e-07,
      "loss": 0.0001,
      "num_tokens": 111740208.0,
      "reward": 0.6803749203681946,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6803749203681946,
      "rewards/reward_func/std": 0.0,
      "step": 4061,
      "step_time": 30.20871962234378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 270.0,
      "completions/max_terminated_length": 270.0,
      "completions/mean_length": 233.1875,
      "completions/mean_terminated_length": 233.1875,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "entropy": 0.22690307348966599,
      "epoch": 0.18814265863825846,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1081831157207489,
      "kl": 0.012482823571190238,
      "learning_rate": 9.623807318202872e-07,
      "loss": -0.0546,
      "num_tokens": 111764051.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 4062,
      "step_time": 23.088761750608683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 197.4375,
      "completions/mean_terminated_length": 197.4375,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.4009406939148903,
      "epoch": 0.18818897637795276,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004363492596894503,
      "kl": 0.003706438699737191,
      "learning_rate": 9.623714682723483e-07,
      "loss": 0.0002,
      "num_tokens": 111795946.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4063,
      "step_time": 21.760704543441534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 196.0625,
      "completions/mean_terminated_length": 196.0625,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.20653334259986877,
      "epoch": 0.18823529411764706,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037432056851685047,
      "kl": 0.0021062337327748537,
      "learning_rate": 9.623622047244095e-07,
      "loss": 0.0001,
      "num_tokens": 111834331.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4064,
      "step_time": 22.563964564353228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 178.8125,
      "completions/mean_terminated_length": 178.8125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.4411425143480301,
      "epoch": 0.18828161185734135,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034896032884716988,
      "kl": 0.003495117765851319,
      "learning_rate": 9.623529411764706e-07,
      "loss": 0.0002,
      "num_tokens": 111881560.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4065,
      "step_time": 24.425624758005142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 195.5625,
      "completions/mean_terminated_length": 195.5625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.3708682432770729,
      "epoch": 0.18832792959703568,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008655146695673466,
      "kl": 0.008445442072115839,
      "learning_rate": 9.623436776285317e-07,
      "loss": 0.0004,
      "num_tokens": 111907985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4066,
      "step_time": 21.677610144019127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 206.0,
      "completions/mean_terminated_length": 206.0,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.21363132447004318,
      "epoch": 0.18837424733672997,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0956958532333374,
      "kl": 0.061965879052877426,
      "learning_rate": 9.623344140805928e-07,
      "loss": -0.0308,
      "num_tokens": 111929761.0,
      "reward": 0.9857840538024902,
      "reward_std": 0.056863654404878616,
      "rewards/reward_func/mean": 0.9857840538024902,
      "rewards/reward_func/std": 0.056863654404878616,
      "step": 4067,
      "step_time": 19.852060932666063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 127.375,
      "completions/mean_terminated_length": 127.375,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.25247569754719734,
      "epoch": 0.18842056507642427,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01273462362587452,
      "kl": 0.004691361915320158,
      "learning_rate": 9.62325150532654e-07,
      "loss": 0.0002,
      "num_tokens": 111949223.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4068,
      "step_time": 13.748459201306105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 160.0,
      "completions/mean_terminated_length": 160.0,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.22371292486786842,
      "epoch": 0.18846688281611856,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21906466782093048,
      "kl": 0.017677327734418213,
      "learning_rate": 9.62315886984715e-07,
      "loss": 0.0066,
      "num_tokens": 111969607.0,
      "reward": 0.9081355333328247,
      "reward_std": 0.2510214149951935,
      "rewards/reward_func/mean": 0.9081355333328247,
      "rewards/reward_func/std": 0.2510214149951935,
      "step": 4069,
      "step_time": 16.426375936716795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 132.25,
      "completions/mean_terminated_length": 132.25,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2735486924648285,
      "epoch": 0.1885132005558129,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004998963326215744,
      "kl": 0.0028393929824233055,
      "learning_rate": 9.623066234367762e-07,
      "loss": 0.0001,
      "num_tokens": 111989355.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4070,
      "step_time": 14.137649320065975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 305.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 258.4375,
      "completions/mean_terminated_length": 258.4375,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "entropy": 0.4584140181541443,
      "epoch": 0.18855951829550718,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09336674213409424,
      "kl": 0.0033523670863360167,
      "learning_rate": 9.622973598888373e-07,
      "loss": 0.0238,
      "num_tokens": 112017762.0,
      "reward": 0.4375,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.4375,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 4071,
      "step_time": 26.69462824985385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 126.0625,
      "completions/mean_terminated_length": 126.0625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.28217051923274994,
      "epoch": 0.18860583603520148,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008498135954141617,
      "kl": 0.0033046818571165204,
      "learning_rate": 9.622880963408985e-07,
      "loss": 0.0002,
      "num_tokens": 112037571.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4072,
      "step_time": 13.568863987922668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 187.3125,
      "completions/mean_terminated_length": 187.3125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.4118848964571953,
      "epoch": 0.18865215377489578,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15025535225868225,
      "kl": 0.007421009591780603,
      "learning_rate": 9.622788327929596e-07,
      "loss": -0.0591,
      "num_tokens": 112059032.0,
      "reward": 0.25,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 4073,
      "step_time": 19.93145489320159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 294.0,
      "completions/max_terminated_length": 294.0,
      "completions/mean_length": 212.9375,
      "completions/mean_terminated_length": 212.9375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.2828494608402252,
      "epoch": 0.1886984715145901,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09432634711265564,
      "kl": 0.012376365251839161,
      "learning_rate": 9.622695692450207e-07,
      "loss": 0.0348,
      "num_tokens": 112088695.0,
      "reward": 0.09933918714523315,
      "reward_std": 0.026740100234746933,
      "rewards/reward_func/mean": 0.09933918714523315,
      "rewards/reward_func/std": 0.026740102097392082,
      "step": 4074,
      "step_time": 26.48119631409645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 178.4375,
      "completions/mean_terminated_length": 178.4375,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.21490349248051643,
      "epoch": 0.1887447892542844,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017751294653862715,
      "kl": 0.004497880348935723,
      "learning_rate": 9.622603056970818e-07,
      "loss": 0.0002,
      "num_tokens": 112124990.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4075,
      "step_time": 22.12447264790535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 138.0,
      "completions/mean_terminated_length": 138.0,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3145938962697983,
      "epoch": 0.1887911069939787,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038075167685747147,
      "kl": 0.002874632424209267,
      "learning_rate": 9.622510421491432e-07,
      "loss": 0.0001,
      "num_tokens": 112149118.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4076,
      "step_time": 16.077032446861267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 204.0625,
      "completions/mean_terminated_length": 204.0625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.2810898497700691,
      "epoch": 0.188837424733673,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13672883808612823,
      "kl": 0.007074593333527446,
      "learning_rate": 9.622417786012043e-07,
      "loss": -0.0298,
      "num_tokens": 112173551.0,
      "reward": 0.4939180016517639,
      "reward_std": 0.13089695572853088,
      "rewards/reward_func/mean": 0.4939180016517639,
      "rewards/reward_func/std": 0.13089697062969208,
      "step": 4077,
      "step_time": 21.388612024486065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 126.5,
      "completions/mean_terminated_length": 126.5,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.23593541234731674,
      "epoch": 0.1888837424733673,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003772187978029251,
      "kl": 0.0024787528382148594,
      "learning_rate": 9.622325150532652e-07,
      "loss": 0.0001,
      "num_tokens": 112195031.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4078,
      "step_time": 14.221291285008192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 124.875,
      "completions/mean_terminated_length": 124.875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3156389445066452,
      "epoch": 0.1889300602130616,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0039777373895049095,
      "kl": 0.0022487479145638645,
      "learning_rate": 9.622232515053266e-07,
      "loss": 0.0001,
      "num_tokens": 112219701.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4079,
      "step_time": 14.497664973139763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 140.4375,
      "completions/mean_terminated_length": 140.4375,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3001774325966835,
      "epoch": 0.1889763779527559,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003297751070931554,
      "kl": 0.002578421903308481,
      "learning_rate": 9.622139879573877e-07,
      "loss": 0.0001,
      "num_tokens": 112239852.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4080,
      "step_time": 14.37793630734086
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 143.5,
      "completions/mean_terminated_length": 143.5,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.41828029602766037,
      "epoch": 0.1890226956924502,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026004703249782324,
      "kl": 0.0020941461552865803,
      "learning_rate": 9.622047244094488e-07,
      "loss": 0.0001,
      "num_tokens": 112283508.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4081,
      "step_time": 20.340359319001436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 127.75,
      "completions/mean_terminated_length": 127.75,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.27900033444166183,
      "epoch": 0.18906901343214452,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013750758953392506,
      "kl": 0.003349784354213625,
      "learning_rate": 9.6219546086151e-07,
      "loss": 0.0002,
      "num_tokens": 112303728.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4082,
      "step_time": 13.775241889059544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 132.0625,
      "completions/mean_terminated_length": 132.0625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.31625083088874817,
      "epoch": 0.18911533117183882,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003438417799770832,
      "kl": 0.0021677478798665106,
      "learning_rate": 9.62186197313571e-07,
      "loss": 0.0001,
      "num_tokens": 112325777.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4083,
      "step_time": 14.63520834967494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 145.5,
      "completions/mean_terminated_length": 145.5,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.322076752781868,
      "epoch": 0.18916164891153311,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0041464813984930515,
      "kl": 0.0029935682541690767,
      "learning_rate": 9.621769337656322e-07,
      "loss": 0.0001,
      "num_tokens": 112347497.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4084,
      "step_time": 16.739662885665894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 159.125,
      "completions/mean_terminated_length": 159.125,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.39617153257131577,
      "epoch": 0.1892079666512274,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022639597300440073,
      "kl": 0.0021434122463688254,
      "learning_rate": 9.621676702176933e-07,
      "loss": 0.0001,
      "num_tokens": 112378731.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4085,
      "step_time": 20.17177975550294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 154.6875,
      "completions/mean_terminated_length": 154.6875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.397089883685112,
      "epoch": 0.18925428439092173,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002172194654121995,
      "kl": 0.0022505151864606887,
      "learning_rate": 9.621584066697544e-07,
      "loss": 0.0001,
      "num_tokens": 112425814.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4086,
      "step_time": 23.75847203284502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 173.0,
      "completions/mean_terminated_length": 173.0,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.23387274146080017,
      "epoch": 0.18930060213061603,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002618056023493409,
      "kl": 0.001825862971600145,
      "learning_rate": 9.621491431218156e-07,
      "loss": 0.0001,
      "num_tokens": 112455686.0,
      "reward": 0.9487294554710388,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9487294554710388,
      "rewards/reward_func/std": 0.0,
      "step": 4087,
      "step_time": 19.35735733062029
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 337.0,
      "completions/max_terminated_length": 337.0,
      "completions/mean_length": 226.5625,
      "completions/mean_terminated_length": 226.5625,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.4100160673260689,
      "epoch": 0.18934691987031033,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.01020955853164196,
      "kl": 0.010344581445679069,
      "learning_rate": 9.621398795738767e-07,
      "loss": 0.0004,
      "num_tokens": 112483375.0,
      "reward": 3.279789240195896e-08,
      "reward_std": 1.1687691170436665e-07,
      "rewards/reward_func/mean": 3.279789240195896e-08,
      "rewards/reward_func/std": 1.16876918809794e-07,
      "step": 4088,
      "step_time": 28.477491047233343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 220.0,
      "completions/mean_terminated_length": 220.0,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.1213931031525135,
      "epoch": 0.18939323761000462,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018979375017806888,
      "kl": 0.0014054529601708055,
      "learning_rate": 9.62130616025938e-07,
      "loss": 0.0001,
      "num_tokens": 112508159.0,
      "reward": 0.9534969329833984,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9534969329833984,
      "rewards/reward_func/std": 0.0,
      "step": 4089,
      "step_time": 21.313947524875402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 128.6875,
      "completions/mean_terminated_length": 128.6875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2810846120119095,
      "epoch": 0.18943955534969895,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004309963900595903,
      "kl": 0.0022224989079404622,
      "learning_rate": 9.621213524779991e-07,
      "loss": 0.0001,
      "num_tokens": 112527930.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4090,
      "step_time": 15.578890204429626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 142.8125,
      "completions/mean_terminated_length": 142.8125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.28713301569223404,
      "epoch": 0.18948587308939324,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034399654250591993,
      "kl": 0.0029686609632335603,
      "learning_rate": 9.6211208893006e-07,
      "loss": 0.0001,
      "num_tokens": 112552935.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4091,
      "step_time": 17.679511532187462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 262.0,
      "completions/max_terminated_length": 262.0,
      "completions/mean_length": 193.125,
      "completions/mean_terminated_length": 193.125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.40622394531965256,
      "epoch": 0.18953219082908754,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006183540914207697,
      "kl": 0.0047001210623420775,
      "learning_rate": 9.621028253821214e-07,
      "loss": 0.0002,
      "num_tokens": 112598009.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4092,
      "step_time": 28.116632137447596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 127.5,
      "completions/mean_terminated_length": 127.5,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.28625255078077316,
      "epoch": 0.18957850856878183,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005459882784634829,
      "kl": 0.003465807647444308,
      "learning_rate": 9.620935618341825e-07,
      "loss": 0.0002,
      "num_tokens": 112627521.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4093,
      "step_time": 16.235010791569948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 324.0,
      "completions/max_terminated_length": 324.0,
      "completions/mean_length": 282.8125,
      "completions/mean_terminated_length": 282.8125,
      "completions/min_length": 271.0,
      "completions/min_terminated_length": 271.0,
      "entropy": 0.17093633115291595,
      "epoch": 0.18962482630847616,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08792279660701752,
      "kl": 0.016549956053495407,
      "learning_rate": 9.620842982862436e-07,
      "loss": 0.0133,
      "num_tokens": 112667758.0,
      "reward": 0.995914101600647,
      "reward_std": 0.016343481838703156,
      "rewards/reward_func/mean": 0.995914101600647,
      "rewards/reward_func/std": 0.016343489289283752,
      "step": 4094,
      "step_time": 31.010224632918835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 124.8125,
      "completions/mean_terminated_length": 124.8125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.29375743120908737,
      "epoch": 0.18967114404817045,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002086227759718895,
      "kl": 0.0017451457388233393,
      "learning_rate": 9.620750347383048e-07,
      "loss": 0.0001,
      "num_tokens": 112689755.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4095,
      "step_time": 14.13269180059433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 139.5,
      "completions/mean_terminated_length": 139.5,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.3268865644931793,
      "epoch": 0.18971746178786475,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006312190555036068,
      "kl": 0.0041411496931687,
      "learning_rate": 9.620657711903659e-07,
      "loss": 0.0002,
      "num_tokens": 112710003.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4096,
      "step_time": 15.974027272313833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 295.0,
      "completions/max_terminated_length": 295.0,
      "completions/mean_length": 233.5,
      "completions/mean_terminated_length": 233.5,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "entropy": 0.2679096534848213,
      "epoch": 0.18976377952755905,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06756766885519028,
      "kl": 0.015096401330083609,
      "learning_rate": 9.62056507642427e-07,
      "loss": -0.0137,
      "num_tokens": 112732075.0,
      "reward": 0.7114852666854858,
      "reward_std": 0.18972940742969513,
      "rewards/reward_func/mean": 0.7114852666854858,
      "rewards/reward_func/std": 0.18972940742969513,
      "step": 4097,
      "step_time": 24.38409310951829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 161.1875,
      "completions/mean_terminated_length": 161.1875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.175911545753479,
      "epoch": 0.18981009726725337,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13000069558620453,
      "kl": 0.005401259404607117,
      "learning_rate": 9.620472440944881e-07,
      "loss": -0.0191,
      "num_tokens": 112753278.0,
      "reward": 0.848545253276825,
      "reward_std": 0.05034581571817398,
      "rewards/reward_func/mean": 0.848545253276825,
      "rewards/reward_func/std": 0.05034581571817398,
      "step": 4098,
      "step_time": 17.01097398623824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 173.5,
      "completions/mean_terminated_length": 173.5,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.42449715733528137,
      "epoch": 0.18985641500694767,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013729634694755077,
      "kl": 0.004774175118654966,
      "learning_rate": 9.620379805465493e-07,
      "loss": 0.0002,
      "num_tokens": 112787798.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4099,
      "step_time": 20.552163925021887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 142.4375,
      "completions/mean_terminated_length": 142.4375,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.32483430951833725,
      "epoch": 0.18990273274664196,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006355844903737307,
      "kl": 0.0042109721107408404,
      "learning_rate": 9.620287169986104e-07,
      "loss": 0.0002,
      "num_tokens": 112815293.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4100,
      "step_time": 17.171034947037697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 168.0,
      "completions/mean_terminated_length": 168.0,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.2035413607954979,
      "epoch": 0.18994905048633626,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12430978566408157,
      "kl": 0.014210526598617435,
      "learning_rate": 9.620194534506715e-07,
      "loss": -0.018,
      "num_tokens": 112836461.0,
      "reward": 0.826245903968811,
      "reward_std": 0.12673979997634888,
      "rewards/reward_func/mean": 0.826245903968811,
      "rewards/reward_func/std": 0.12673981487751007,
      "step": 4101,
      "step_time": 18.095979381352663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 194.75,
      "completions/mean_terminated_length": 194.75,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "entropy": 0.14771785028278828,
      "epoch": 0.18999536822603058,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008715835399925709,
      "kl": 0.0030156224966049194,
      "learning_rate": 9.620101899027328e-07,
      "loss": 0.0002,
      "num_tokens": 112861641.0,
      "reward": 0.6170787811279297,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6170787811279297,
      "rewards/reward_func/std": 0.0,
      "step": 4102,
      "step_time": 19.890064790844917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 156.5625,
      "completions/mean_terminated_length": 156.5625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.26230965554714203,
      "epoch": 0.19004168596572488,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002729707397520542,
      "kl": 0.00267799012362957,
      "learning_rate": 9.620009263547938e-07,
      "loss": 0.0001,
      "num_tokens": 112883586.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4103,
      "step_time": 16.894199144095182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 135.3125,
      "completions/mean_terminated_length": 135.3125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2919178009033203,
      "epoch": 0.19008800370541917,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006839972920715809,
      "kl": 0.0037067380035296082,
      "learning_rate": 9.619916628068549e-07,
      "loss": 0.0002,
      "num_tokens": 112903751.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4104,
      "step_time": 15.149609547108412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 205.625,
      "completions/mean_terminated_length": 205.625,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.22336683422327042,
      "epoch": 0.19013432144511347,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12809725105762482,
      "kl": 0.015582123305648565,
      "learning_rate": 9.61982399258916e-07,
      "loss": 0.0231,
      "num_tokens": 112930417.0,
      "reward": 0.990056037902832,
      "reward_std": 0.039775896817445755,
      "rewards/reward_func/mean": 0.990056037902832,
      "rewards/reward_func/std": 0.03977589309215546,
      "step": 4105,
      "step_time": 22.927800353616476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 222.6875,
      "completions/mean_terminated_length": 222.6875,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.40010541677474976,
      "epoch": 0.1901806391848078,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12244771420955658,
      "kl": 0.008408163907006383,
      "learning_rate": 9.619731357109773e-07,
      "loss": -0.0514,
      "num_tokens": 112958156.0,
      "reward": 0.35345709323883057,
      "reward_std": 0.471556156873703,
      "rewards/reward_func/mean": 0.35345709323883057,
      "rewards/reward_func/std": 0.4715561866760254,
      "step": 4106,
      "step_time": 23.678833052515984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 153.0625,
      "completions/mean_terminated_length": 153.0625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3967430666089058,
      "epoch": 0.1902269569245021,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036235363222658634,
      "kl": 0.003490384726319462,
      "learning_rate": 9.619638721630385e-07,
      "loss": 0.0002,
      "num_tokens": 112999309.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4107,
      "step_time": 21.72936211153865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 147.1875,
      "completions/mean_terminated_length": 147.1875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3907182961702347,
      "epoch": 0.19027327466419638,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001981359673663974,
      "kl": 0.0022990216966718435,
      "learning_rate": 9.619546086150996e-07,
      "loss": 0.0001,
      "num_tokens": 113033424.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4108,
      "step_time": 18.73133908584714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 135.5625,
      "completions/mean_terminated_length": 135.5625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.30377212166786194,
      "epoch": 0.19031959240389068,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027422490529716015,
      "kl": 0.0021515804110094905,
      "learning_rate": 9.619453450671607e-07,
      "loss": 0.0001,
      "num_tokens": 113055865.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4109,
      "step_time": 15.47657148167491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 168.875,
      "completions/mean_terminated_length": 168.875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.34115321189165115,
      "epoch": 0.190365910143585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005145085975527763,
      "kl": 0.00553589453920722,
      "learning_rate": 9.619360815192218e-07,
      "loss": 0.0003,
      "num_tokens": 113076935.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4110,
      "step_time": 18.9564984254539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 175.75,
      "completions/mean_terminated_length": 175.75,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.2507578805088997,
      "epoch": 0.1904122278832793,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12401070445775986,
      "kl": 0.007570909336209297,
      "learning_rate": 9.61926817971283e-07,
      "loss": 0.0585,
      "num_tokens": 113100355.0,
      "reward": 0.9143877029418945,
      "reward_std": 0.26070141792297363,
      "rewards/reward_func/mean": 0.9143877029418945,
      "rewards/reward_func/std": 0.26070141792297363,
      "step": 4111,
      "step_time": 20.16663908213377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 344.0,
      "completions/max_terminated_length": 344.0,
      "completions/mean_length": 211.5625,
      "completions/mean_terminated_length": 211.5625,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.3748941421508789,
      "epoch": 0.1904585456229736,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10845178365707397,
      "kl": 0.005902686039917171,
      "learning_rate": 9.61917554423344e-07,
      "loss": 0.2146,
      "num_tokens": 113135740.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 4112,
      "step_time": 30.707610316574574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 126.5625,
      "completions/mean_terminated_length": 126.5625,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.3122868686914444,
      "epoch": 0.1905048633626679,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0039450665935873985,
      "kl": 0.0023736665316391736,
      "learning_rate": 9.619082908754052e-07,
      "loss": 0.0001,
      "num_tokens": 113155733.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4113,
      "step_time": 14.587149430066347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 148.25,
      "completions/mean_terminated_length": 148.25,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.39277520775794983,
      "epoch": 0.19055118110236222,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009942091070115566,
      "kl": 0.005806333385407925,
      "learning_rate": 9.618990273274663e-07,
      "loss": 0.0003,
      "num_tokens": 113178409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4114,
      "step_time": 16.174455918371677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 164.0,
      "completions/mean_terminated_length": 164.0,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.4367034435272217,
      "epoch": 0.1905974988420565,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024834980722516775,
      "kl": 0.0026712362887337804,
      "learning_rate": 9.618897637795275e-07,
      "loss": 0.0001,
      "num_tokens": 113229433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4115,
      "step_time": 24.28413689136505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 136.5,
      "completions/mean_terminated_length": 136.5,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.258312463760376,
      "epoch": 0.1906438165817508,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010435893200337887,
      "kl": 0.0038344977074302733,
      "learning_rate": 9.618805002315886e-07,
      "loss": 0.0002,
      "num_tokens": 113249169.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4116,
      "step_time": 14.605042569339275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 188.3125,
      "completions/mean_terminated_length": 188.3125,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.42330311983823776,
      "epoch": 0.1906901343214451,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005170047283172607,
      "kl": 0.0034668792504817247,
      "learning_rate": 9.618712366836497e-07,
      "loss": 0.0002,
      "num_tokens": 113272758.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4117,
      "step_time": 19.21664920821786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 191.5,
      "completions/mean_terminated_length": 191.5,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.3497743457555771,
      "epoch": 0.19073645206113943,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1662413775920868,
      "kl": 0.007731353281997144,
      "learning_rate": 9.618619731357108e-07,
      "loss": -0.0099,
      "num_tokens": 113302878.0,
      "reward": 0.08951783180236816,
      "reward_std": 0.26566338539123535,
      "rewards/reward_func/mean": 0.08951783180236816,
      "rewards/reward_func/std": 0.26566338539123535,
      "step": 4118,
      "step_time": 21.004832059144974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 192.9375,
      "completions/mean_terminated_length": 192.9375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.17815063148736954,
      "epoch": 0.19078276980083372,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033755472395569086,
      "kl": 0.00312114623375237,
      "learning_rate": 9.618527095877722e-07,
      "loss": 0.0002,
      "num_tokens": 113333661.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4119,
      "step_time": 21.385315846651793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 142.875,
      "completions/mean_terminated_length": 142.875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.2912794053554535,
      "epoch": 0.19082908754052802,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002820141613483429,
      "kl": 0.002084189676679671,
      "learning_rate": 9.618434460398333e-07,
      "loss": 0.0001,
      "num_tokens": 113354523.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4120,
      "step_time": 15.178599156439304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 193.125,
      "completions/mean_terminated_length": 193.125,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.2421119175851345,
      "epoch": 0.19087540528022232,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09715329110622406,
      "kl": 0.005439578555524349,
      "learning_rate": 9.618341824918942e-07,
      "loss": -0.0392,
      "num_tokens": 113376237.0,
      "reward": 0.9850083589553833,
      "reward_std": 0.03223112225532532,
      "rewards/reward_func/mean": 0.9850083589553833,
      "rewards/reward_func/std": 0.03223112225532532,
      "step": 4121,
      "step_time": 20.041270956397057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 132.6875,
      "completions/mean_terminated_length": 132.6875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.272799015045166,
      "epoch": 0.19092172301991664,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033663539215922356,
      "kl": 0.0022227732406463474,
      "learning_rate": 9.618249189439556e-07,
      "loss": 0.0001,
      "num_tokens": 113396184.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4122,
      "step_time": 14.523907784372568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 167.5,
      "completions/mean_terminated_length": 167.5,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.3335861638188362,
      "epoch": 0.19096804075961094,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0039022567216306925,
      "kl": 0.0028716546948999166,
      "learning_rate": 9.618156553960167e-07,
      "loss": 0.0001,
      "num_tokens": 113421728.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4123,
      "step_time": 17.749713256955147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 132.8125,
      "completions/mean_terminated_length": 132.8125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3208964839577675,
      "epoch": 0.19101435849930523,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031338210683315992,
      "kl": 0.0026089075254276395,
      "learning_rate": 9.618063918480778e-07,
      "loss": 0.0001,
      "num_tokens": 113445645.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4124,
      "step_time": 15.515925850719213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 198.0625,
      "completions/mean_terminated_length": 198.0625,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.19702854380011559,
      "epoch": 0.19106067623899953,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011642944067716599,
      "kl": 0.006318412139080465,
      "learning_rate": 9.61797128300139e-07,
      "loss": 0.0003,
      "num_tokens": 113470190.0,
      "reward": 0.8539396524429321,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8539396524429321,
      "rewards/reward_func/std": 0.0,
      "step": 4125,
      "step_time": 19.687298599630594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 132.625,
      "completions/mean_terminated_length": 132.625,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.28254781663417816,
      "epoch": 0.19110699397869385,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002491622930392623,
      "kl": 0.0019359943107701838,
      "learning_rate": 9.617878647522e-07,
      "loss": 0.0001,
      "num_tokens": 113502120.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4126,
      "step_time": 16.2068096883595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 149.0625,
      "completions/mean_terminated_length": 149.0625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.287677139043808,
      "epoch": 0.19115331171838815,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0059418524615466595,
      "kl": 0.00384613499045372,
      "learning_rate": 9.617786012042612e-07,
      "loss": 0.0002,
      "num_tokens": 113523689.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4127,
      "step_time": 16.24800293147564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 131.125,
      "completions/mean_terminated_length": 131.125,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.22621575370430946,
      "epoch": 0.19119962945808244,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00238971458747983,
      "kl": 0.0015752276813145727,
      "learning_rate": 9.617693376563223e-07,
      "loss": 0.0001,
      "num_tokens": 113543243.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4128,
      "step_time": 13.781833782792091
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 186.75,
      "completions/mean_terminated_length": 186.75,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.2259707935154438,
      "epoch": 0.19124594719777674,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018358264351263642,
      "kl": 0.0019115104514639825,
      "learning_rate": 9.617600741083834e-07,
      "loss": 0.0001,
      "num_tokens": 113579783.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4129,
      "step_time": 21.70344466343522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 130.875,
      "completions/mean_terminated_length": 130.875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.31674399226903915,
      "epoch": 0.19129226493747106,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003532495116814971,
      "kl": 0.0019234635983593762,
      "learning_rate": 9.617508105604446e-07,
      "loss": 0.0001,
      "num_tokens": 113607029.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4130,
      "step_time": 16.637072067707777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 270.0,
      "completions/max_terminated_length": 270.0,
      "completions/mean_length": 232.3125,
      "completions/mean_terminated_length": 232.3125,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "entropy": 0.4538995549082756,
      "epoch": 0.19133858267716536,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09977710992097855,
      "kl": 0.003325658617541194,
      "learning_rate": 9.617415470125057e-07,
      "loss": 0.0237,
      "num_tokens": 113629210.0,
      "reward": 0.5625,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.5625,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 4131,
      "step_time": 22.828448496758938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 122.4375,
      "completions/mean_terminated_length": 122.4375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2979244291782379,
      "epoch": 0.19138490041685965,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024236678145825863,
      "kl": 0.0020474987395573407,
      "learning_rate": 9.61732283464567e-07,
      "loss": 0.0001,
      "num_tokens": 113657153.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4132,
      "step_time": 14.929260857403278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 145.9375,
      "completions/mean_terminated_length": 145.9375,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.27596915513277054,
      "epoch": 0.19143121815655395,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016421765321865678,
      "kl": 0.0016378310974687338,
      "learning_rate": 9.617230199166281e-07,
      "loss": 0.0001,
      "num_tokens": 113681984.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4133,
      "step_time": 16.583049949258566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 257.0,
      "completions/max_terminated_length": 257.0,
      "completions/mean_length": 204.625,
      "completions/mean_terminated_length": 204.625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.2172876000404358,
      "epoch": 0.19147753589624827,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010466446168720722,
      "kl": 0.009740041103214025,
      "learning_rate": 9.61713756368689e-07,
      "loss": 0.0005,
      "num_tokens": 113705290.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4134,
      "step_time": 21.961980622261763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 180.0625,
      "completions/mean_terminated_length": 180.0625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.2684297263622284,
      "epoch": 0.19152385363594257,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15504348278045654,
      "kl": 0.008947949158027768,
      "learning_rate": 9.617044928207502e-07,
      "loss": -0.0027,
      "num_tokens": 113727163.0,
      "reward": 0.9928268790245056,
      "reward_std": 0.01960066333413124,
      "rewards/reward_func/mean": 0.9928268790245056,
      "rewards/reward_func/std": 0.019600657746195793,
      "step": 4135,
      "step_time": 17.34838716313243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 140.6875,
      "completions/mean_terminated_length": 140.6875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.20543736964464188,
      "epoch": 0.19157017137563687,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036165397614240646,
      "kl": 0.0022800464066676795,
      "learning_rate": 9.616952292728115e-07,
      "loss": 0.0001,
      "num_tokens": 113747798.0,
      "reward": 0.054531343281269073,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.054531343281269073,
      "rewards/reward_func/std": 0.0,
      "step": 4136,
      "step_time": 14.98941059038043
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 176.0625,
      "completions/mean_terminated_length": 176.0625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.25562866404652596,
      "epoch": 0.19161648911533116,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11315275728702545,
      "kl": 0.009432476945221424,
      "learning_rate": 9.616859657248726e-07,
      "loss": 0.0674,
      "num_tokens": 113768807.0,
      "reward": 0.9258368611335754,
      "reward_std": 0.2512396574020386,
      "rewards/reward_func/mean": 0.9258368611335754,
      "rewards/reward_func/std": 0.25123968720436096,
      "step": 4137,
      "step_time": 19.531320482492447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 164.8125,
      "completions/mean_terminated_length": 164.8125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3511381074786186,
      "epoch": 0.19166280685502549,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030502998270094395,
      "kl": 0.0023590116179548204,
      "learning_rate": 9.616767021769338e-07,
      "loss": 0.0001,
      "num_tokens": 113795892.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4138,
      "step_time": 17.74069155752659
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 198.625,
      "completions/mean_terminated_length": 198.625,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.48868875950574875,
      "epoch": 0.19170912459471978,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00716400658711791,
      "kl": 0.0056730881333351135,
      "learning_rate": 9.616674386289949e-07,
      "loss": 0.0003,
      "num_tokens": 113822110.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4139,
      "step_time": 22.50037330761552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 137.5625,
      "completions/mean_terminated_length": 137.5625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2536194510757923,
      "epoch": 0.19175544233441408,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00179067172575742,
      "kl": 0.001369092016830109,
      "learning_rate": 9.61658175081056e-07,
      "loss": 0.0001,
      "num_tokens": 113841927.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4140,
      "step_time": 14.555555552244186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 535.0,
      "completions/max_terminated_length": 535.0,
      "completions/mean_length": 212.25,
      "completions/mean_terminated_length": 212.25,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.4374123215675354,
      "epoch": 0.19180176007410837,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14294247329235077,
      "kl": 0.0055836348328739405,
      "learning_rate": 9.616489115331171e-07,
      "loss": 0.3803,
      "num_tokens": 113870667.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4141,
      "step_time": 42.89665176346898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 158.625,
      "completions/mean_terminated_length": 158.625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.1771853119134903,
      "epoch": 0.1918480778138027,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2171192318201065,
      "kl": 0.03439937601797283,
      "learning_rate": 9.616396479851783e-07,
      "loss": -0.0669,
      "num_tokens": 113894549.0,
      "reward": 0.7944756746292114,
      "reward_std": 0.21226465702056885,
      "rewards/reward_func/mean": 0.7944756746292114,
      "rewards/reward_func/std": 0.21226465702056885,
      "step": 4142,
      "step_time": 16.87919330596924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 145.0625,
      "completions/mean_terminated_length": 145.0625,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.3588384613394737,
      "epoch": 0.191894395553497,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002160383854061365,
      "kl": 0.0017672441026661545,
      "learning_rate": 9.616303844372394e-07,
      "loss": 0.0001,
      "num_tokens": 113925398.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4143,
      "step_time": 18.057990729808807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 168.1875,
      "completions/mean_terminated_length": 168.1875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.3224322274327278,
      "epoch": 0.1919407132931913,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006190591957420111,
      "kl": 0.003072601044550538,
      "learning_rate": 9.616211208893005e-07,
      "loss": 0.0002,
      "num_tokens": 113954201.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4144,
      "step_time": 18.933008283376694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 177.6875,
      "completions/mean_terminated_length": 177.6875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.38990194350481033,
      "epoch": 0.19198703103288559,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00652702059596777,
      "kl": 0.0049262718530371785,
      "learning_rate": 9.616118573413616e-07,
      "loss": 0.0002,
      "num_tokens": 113977460.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4145,
      "step_time": 18.30564560368657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 137.1875,
      "completions/mean_terminated_length": 137.1875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.35952040553092957,
      "epoch": 0.1920333487725799,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004456476774066687,
      "kl": 0.0028655443456955254,
      "learning_rate": 9.616025937934228e-07,
      "loss": 0.0001,
      "num_tokens": 113999127.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4146,
      "step_time": 14.782212276011705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 299.0,
      "completions/max_terminated_length": 299.0,
      "completions/mean_length": 192.0625,
      "completions/mean_terminated_length": 192.0625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.4055248573422432,
      "epoch": 0.1920796665122742,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007383132819086313,
      "kl": 0.007461455068551004,
      "learning_rate": 9.615933302454839e-07,
      "loss": 0.0004,
      "num_tokens": 114024008.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4147,
      "step_time": 25.052754264324903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 192.75,
      "completions/mean_terminated_length": 192.75,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.42571692913770676,
      "epoch": 0.1921259842519685,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15006126463413239,
      "kl": 0.0028612722526304424,
      "learning_rate": 9.61584066697545e-07,
      "loss": 0.0523,
      "num_tokens": 114052356.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4148,
      "step_time": 21.563019450753927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 152.25,
      "completions/mean_terminated_length": 152.25,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.432545930147171,
      "epoch": 0.1921723019916628,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019523289520293474,
      "kl": 0.002061988925561309,
      "learning_rate": 9.615748031496064e-07,
      "loss": 0.0001,
      "num_tokens": 114085864.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4149,
      "step_time": 18.404424782842398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 317.0,
      "completions/max_terminated_length": 317.0,
      "completions/mean_length": 276.875,
      "completions/mean_terminated_length": 276.875,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "entropy": 0.22658958658576012,
      "epoch": 0.19221861973135712,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09118513762950897,
      "kl": 0.013732125982642174,
      "learning_rate": 9.615655396016675e-07,
      "loss": -0.0242,
      "num_tokens": 114111222.0,
      "reward": 0.9376300573348999,
      "reward_std": 0.03169326111674309,
      "rewards/reward_func/mean": 0.9376300573348999,
      "rewards/reward_func/std": 0.03169327229261398,
      "step": 4150,
      "step_time": 26.937728572636843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 161.0,
      "completions/mean_terminated_length": 161.0,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.14998751133680344,
      "epoch": 0.19226493747105142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7351125478744507,
      "kl": 0.016170429589692503,
      "learning_rate": 9.615562760537286e-07,
      "loss": -0.0087,
      "num_tokens": 114148294.0,
      "reward": 0.9211294651031494,
      "reward_std": 0.047029267996549606,
      "rewards/reward_func/mean": 0.9211294651031494,
      "rewards/reward_func/std": 0.0470292754471302,
      "step": 4151,
      "step_time": 20.600656140595675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 158.5625,
      "completions/mean_terminated_length": 158.5625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.42090681195259094,
      "epoch": 0.1923112552107457,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016604408156126738,
      "kl": 0.0020379158959258348,
      "learning_rate": 9.615470125057897e-07,
      "loss": 0.0001,
      "num_tokens": 114201711.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4152,
      "step_time": 26.212098207324743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 324.0,
      "completions/max_terminated_length": 324.0,
      "completions/mean_length": 226.125,
      "completions/mean_terminated_length": 226.125,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.4798419699072838,
      "epoch": 0.19235757295044,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11509548127651215,
      "kl": 0.009450761834159493,
      "learning_rate": 9.615377489578509e-07,
      "loss": -0.0491,
      "num_tokens": 114236673.0,
      "reward": 0.05819142237305641,
      "reward_std": 0.23276568949222565,
      "rewards/reward_func/mean": 0.05819142237305641,
      "rewards/reward_func/std": 0.23276568949222565,
      "step": 4153,
      "step_time": 29.532480336725712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 149.625,
      "completions/mean_terminated_length": 149.625,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.36657992750406265,
      "epoch": 0.19240389069013433,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002306044800207019,
      "kl": 0.0024614148715045303,
      "learning_rate": 9.61528485409912e-07,
      "loss": 0.0001,
      "num_tokens": 114285531.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4154,
      "step_time": 22.670250222086906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 186.375,
      "completions/mean_terminated_length": 186.375,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.3475746810436249,
      "epoch": 0.19245020842982863,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11811861395835876,
      "kl": 0.0071674431674182415,
      "learning_rate": 9.61519221861973e-07,
      "loss": 0.024,
      "num_tokens": 114318017.0,
      "reward": 0.2126832902431488,
      "reward_std": 0.3804594576358795,
      "rewards/reward_func/mean": 0.2126832902431488,
      "rewards/reward_func/std": 0.38045942783355713,
      "step": 4155,
      "step_time": 21.09713003784418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 152.9375,
      "completions/mean_terminated_length": 152.9375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.4188668951392174,
      "epoch": 0.19249652616952292,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002057152334600687,
      "kl": 0.0024148281081579626,
      "learning_rate": 9.615099583140342e-07,
      "loss": 0.0001,
      "num_tokens": 114360896.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4156,
      "step_time": 21.31528852507472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 139.0,
      "completions/mean_terminated_length": 139.0,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.38907235115766525,
      "epoch": 0.19254284390921722,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020492763724178076,
      "kl": 0.0019699092663358897,
      "learning_rate": 9.615006947660954e-07,
      "loss": 0.0001,
      "num_tokens": 114384736.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4157,
      "step_time": 15.394862465560436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 199.625,
      "completions/mean_terminated_length": 199.625,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.3164656236767769,
      "epoch": 0.19258916164891154,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12706026434898376,
      "kl": 0.02596709644421935,
      "learning_rate": 9.614914312181565e-07,
      "loss": -0.0196,
      "num_tokens": 114413466.0,
      "reward": 0.017014067620038986,
      "reward_std": 0.0137868020683527,
      "rewards/reward_func/mean": 0.017014067620038986,
      "rewards/reward_func/std": 0.013786802999675274,
      "step": 4158,
      "step_time": 20.807936184108257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 199.75,
      "completions/mean_terminated_length": 199.75,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.20087550207972527,
      "epoch": 0.19263547938860584,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11680769175291061,
      "kl": 0.03648154158145189,
      "learning_rate": 9.614821676702176e-07,
      "loss": -0.077,
      "num_tokens": 114436854.0,
      "reward": 0.4652743935585022,
      "reward_std": 0.25358015298843384,
      "rewards/reward_func/mean": 0.4652743935585022,
      "rewards/reward_func/std": 0.25358015298843384,
      "step": 4159,
      "step_time": 21.823095712810755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 127.75,
      "completions/mean_terminated_length": 127.75,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.1348789781332016,
      "epoch": 0.19268179712830014,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009309993125498295,
      "kl": 0.003608310245908797,
      "learning_rate": 9.614729041222787e-07,
      "loss": 0.0002,
      "num_tokens": 114462418.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4160,
      "step_time": 14.663566589355469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 159.375,
      "completions/mean_terminated_length": 159.375,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.21860400587320328,
      "epoch": 0.19272811486799443,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016577020287513733,
      "kl": 0.0014613009407185018,
      "learning_rate": 9.614636405743399e-07,
      "loss": 0.0001,
      "num_tokens": 114485816.0,
      "reward": 0.8824968934059143,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8824968934059143,
      "rewards/reward_func/std": 0.0,
      "step": 4161,
      "step_time": 16.999474443495274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 166.375,
      "completions/mean_terminated_length": 166.375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3817687928676605,
      "epoch": 0.19277443260768876,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002473505912348628,
      "kl": 0.0023684672778472304,
      "learning_rate": 9.614543770264012e-07,
      "loss": 0.0001,
      "num_tokens": 114518302.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4162,
      "step_time": 19.303426075726748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 165.9375,
      "completions/mean_terminated_length": 165.9375,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.16365380585193634,
      "epoch": 0.19282075034738305,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027192379347980022,
      "kl": 0.0019629605812951922,
      "learning_rate": 9.614451134784623e-07,
      "loss": 0.0001,
      "num_tokens": 114539453.0,
      "reward": 0.9487294554710388,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9487294554710388,
      "rewards/reward_func/std": 0.0,
      "step": 4163,
      "step_time": 17.175036642700434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 143.25,
      "completions/mean_terminated_length": 143.25,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.2042892538011074,
      "epoch": 0.19286706808707735,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005544089246541262,
      "kl": 0.0037651165621355176,
      "learning_rate": 9.614358499305234e-07,
      "loss": 0.0002,
      "num_tokens": 114560033.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 4164,
      "step_time": 16.043456874787807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 206.3125,
      "completions/mean_terminated_length": 206.3125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.43551913648843765,
      "epoch": 0.19291338582677164,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10597597807645798,
      "kl": 0.0061697757337242365,
      "learning_rate": 9.614265863825844e-07,
      "loss": 0.0702,
      "num_tokens": 114582182.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4165,
      "step_time": 21.919243324548006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 126.0,
      "completions/max_terminated_length": 126.0,
      "completions/mean_length": 107.1875,
      "completions/mean_terminated_length": 107.1875,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.2782635763287544,
      "epoch": 0.19295970356646597,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033314658794552088,
      "kl": 0.0019797772401943803,
      "learning_rate": 9.614173228346457e-07,
      "loss": 0.0001,
      "num_tokens": 114609609.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4166,
      "step_time": 14.042929541319609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 123.75,
      "completions/mean_terminated_length": 123.75,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.28888339549303055,
      "epoch": 0.19300602130616026,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036419229581952095,
      "kl": 0.002689459885004908,
      "learning_rate": 9.614080592867068e-07,
      "loss": 0.0001,
      "num_tokens": 114632997.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4167,
      "step_time": 14.445013903081417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 186.8125,
      "completions/mean_terminated_length": 186.8125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.27170734480023384,
      "epoch": 0.19305233904585456,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005016711074858904,
      "kl": 0.001881208037957549,
      "learning_rate": 9.61398795738768e-07,
      "loss": 0.0001,
      "num_tokens": 114666418.0,
      "reward": 0.2177126258611679,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.2177126258611679,
      "rewards/reward_func/std": 0.0,
      "step": 4168,
      "step_time": 22.89176604896784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 139.6875,
      "completions/mean_terminated_length": 139.6875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3100237548351288,
      "epoch": 0.19309865678554886,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004565900191664696,
      "kl": 0.003418289590626955,
      "learning_rate": 9.61389532190829e-07,
      "loss": 0.0002,
      "num_tokens": 114689997.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4169,
      "step_time": 16.47387283295393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 150.5,
      "completions/mean_terminated_length": 150.5,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.21429673582315445,
      "epoch": 0.19314497452524318,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1330195516347885,
      "kl": 0.005152279802132398,
      "learning_rate": 9.613802686428902e-07,
      "loss": 0.0011,
      "num_tokens": 114713637.0,
      "reward": 0.9084429740905762,
      "reward_std": 0.01863100565969944,
      "rewards/reward_func/mean": 0.9084429740905762,
      "rewards/reward_func/std": 0.018631013110280037,
      "step": 4170,
      "step_time": 16.273397151380777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 187.25,
      "completions/mean_terminated_length": 187.25,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.21282903105020523,
      "epoch": 0.19319129226493748,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0058262222446501255,
      "kl": 0.005617423914372921,
      "learning_rate": 9.613710050949513e-07,
      "loss": 0.0003,
      "num_tokens": 114735305.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4171,
      "step_time": 18.354433950036764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 145.375,
      "completions/mean_terminated_length": 145.375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3839796110987663,
      "epoch": 0.19323761000463177,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004510688595473766,
      "kl": 0.0033752802992239594,
      "learning_rate": 9.613617415470124e-07,
      "loss": 0.0002,
      "num_tokens": 114762687.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4172,
      "step_time": 17.608592182397842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 142.375,
      "completions/mean_terminated_length": 142.375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3697587624192238,
      "epoch": 0.19328392774432607,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0048271669074893,
      "kl": 0.0035815382725559175,
      "learning_rate": 9.613524779990736e-07,
      "loss": 0.0002,
      "num_tokens": 114804485.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4173,
      "step_time": 21.474687982350588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 130.0625,
      "completions/mean_terminated_length": 130.0625,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2657466493546963,
      "epoch": 0.1933302454840204,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13559861481189728,
      "kl": 0.009173538070172071,
      "learning_rate": 9.613432144511347e-07,
      "loss": 0.0667,
      "num_tokens": 114824566.0,
      "reward": 0.7353917360305786,
      "reward_std": 0.3687800467014313,
      "rewards/reward_func/mean": 0.7353917360305786,
      "rewards/reward_func/std": 0.3687800467014313,
      "step": 4174,
      "step_time": 15.048453759402037
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 161.5,
      "completions/mean_terminated_length": 161.5,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.39067216217517853,
      "epoch": 0.1933765632237147,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0065057831816375256,
      "kl": 0.002822590176947415,
      "learning_rate": 9.613339509031958e-07,
      "loss": 0.0001,
      "num_tokens": 114875918.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4175,
      "step_time": 24.341829635202885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 154.25,
      "completions/mean_terminated_length": 154.25,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3952605575323105,
      "epoch": 0.19342288096340898,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010032407008111477,
      "kl": 0.003641855902969837,
      "learning_rate": 9.613246873552571e-07,
      "loss": 0.0002,
      "num_tokens": 114906914.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4176,
      "step_time": 19.441778726875782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 161.1875,
      "completions/mean_terminated_length": 161.1875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.3579148128628731,
      "epoch": 0.19346919870310328,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004618246108293533,
      "kl": 0.003541071724612266,
      "learning_rate": 9.61315423807318e-07,
      "loss": 0.0002,
      "num_tokens": 114937877.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4177,
      "step_time": 18.79634879156947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 153.375,
      "completions/mean_terminated_length": 153.375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.1737445928156376,
      "epoch": 0.1935155164427976,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005803847219794989,
      "kl": 0.012925693765282631,
      "learning_rate": 9.613061602593792e-07,
      "loss": 0.0006,
      "num_tokens": 114959803.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 4178,
      "step_time": 18.71392446756363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 142.75,
      "completions/mean_terminated_length": 142.75,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.36541668325662613,
      "epoch": 0.1935618341824919,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004241726361215115,
      "kl": 0.0033100010768976063,
      "learning_rate": 9.612968967114405e-07,
      "loss": 0.0002,
      "num_tokens": 115011111.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4179,
      "step_time": 22.45142360776663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 173.5625,
      "completions/mean_terminated_length": 173.5625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4178507551550865,
      "epoch": 0.1936081519221862,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032413292210549116,
      "kl": 0.0025839487207122147,
      "learning_rate": 9.612876331635016e-07,
      "loss": 0.0001,
      "num_tokens": 115051392.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4180,
      "step_time": 23.177177645266056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 136.4375,
      "completions/mean_terminated_length": 136.4375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.31026995927095413,
      "epoch": 0.1936544696618805,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030805582646280527,
      "kl": 0.001636517175938934,
      "learning_rate": 9.612783696155628e-07,
      "loss": 0.0001,
      "num_tokens": 115087575.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4181,
      "step_time": 18.440025456249714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 177.3125,
      "completions/mean_terminated_length": 177.3125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.20102904736995697,
      "epoch": 0.19370078740157481,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007652140222489834,
      "kl": 0.007242341234814376,
      "learning_rate": 9.61269106067624e-07,
      "loss": 0.0004,
      "num_tokens": 115122204.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 4182,
      "step_time": 22.110097400844097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 154.0625,
      "completions/mean_terminated_length": 154.0625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.3495076298713684,
      "epoch": 0.1937471051412691,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021891407668590546,
      "kl": 0.0018741044332273304,
      "learning_rate": 9.61259842519685e-07,
      "loss": 0.0001,
      "num_tokens": 115151405.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4183,
      "step_time": 19.876628793776035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 201.625,
      "completions/mean_terminated_length": 201.625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.2147158570587635,
      "epoch": 0.1937934228809634,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004653643351048231,
      "kl": 0.0035977481165900826,
      "learning_rate": 9.612505789717461e-07,
      "loss": 0.0002,
      "num_tokens": 115175191.0,
      "reward": 0.7703813910484314,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7703813910484314,
      "rewards/reward_func/std": 0.0,
      "step": 4184,
      "step_time": 24.77579763531685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 209.0,
      "completions/mean_terminated_length": 209.0,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.18587301298975945,
      "epoch": 0.1938397406206577,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12032096832990646,
      "kl": 0.005851471039932221,
      "learning_rate": 9.612413154238073e-07,
      "loss": -0.0155,
      "num_tokens": 115208167.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 4185,
      "step_time": 25.782169092446566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 158.5625,
      "completions/mean_terminated_length": 158.5625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.34699859470129013,
      "epoch": 0.19388605836035203,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022761430591344833,
      "kl": 0.0020523566636256874,
      "learning_rate": 9.612320518758684e-07,
      "loss": 0.0001,
      "num_tokens": 115240208.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4186,
      "step_time": 22.218475986272097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 210.0625,
      "completions/mean_terminated_length": 210.0625,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "entropy": 0.2231697253882885,
      "epoch": 0.19393237610004632,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0070524620823562145,
      "kl": 0.007093827938660979,
      "learning_rate": 9.612227883279295e-07,
      "loss": 0.0004,
      "num_tokens": 115273697.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4187,
      "step_time": 25.761937137693167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 185.0625,
      "completions/mean_terminated_length": 185.0625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.23038675263524055,
      "epoch": 0.19397869383974062,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09096844494342804,
      "kl": 0.008536142529919744,
      "learning_rate": 9.612135247799906e-07,
      "loss": -0.0438,
      "num_tokens": 115296978.0,
      "reward": 0.9396798610687256,
      "reward_std": 0.023546535521745682,
      "rewards/reward_func/mean": 0.9396798610687256,
      "rewards/reward_func/std": 0.023546550422906876,
      "step": 4188,
      "step_time": 21.817990139126778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 133.25,
      "completions/mean_terminated_length": 133.25,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3383130058646202,
      "epoch": 0.19402501157943491,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037833014503121376,
      "kl": 0.0027758661308325827,
      "learning_rate": 9.612042612320518e-07,
      "loss": 0.0001,
      "num_tokens": 115318198.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4189,
      "step_time": 17.471931621432304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 146.5,
      "completions/mean_terminated_length": 146.5,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.34252479672431946,
      "epoch": 0.19407132931912924,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029100715182721615,
      "kl": 0.0021770632301922888,
      "learning_rate": 9.61194997684113e-07,
      "loss": 0.0001,
      "num_tokens": 115344126.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4190,
      "step_time": 18.583404313772917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 151.3125,
      "completions/mean_terminated_length": 151.3125,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3847458064556122,
      "epoch": 0.19411764705882353,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002347928937524557,
      "kl": 0.0023530853795818985,
      "learning_rate": 9.61185734136174e-07,
      "loss": 0.0001,
      "num_tokens": 115377299.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4191,
      "step_time": 21.23593070358038
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 171.1875,
      "completions/mean_terminated_length": 171.1875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.19050484895706177,
      "epoch": 0.19416396479851783,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015054878313094378,
      "kl": 0.0011488858435768634,
      "learning_rate": 9.611764705882354e-07,
      "loss": 0.0001,
      "num_tokens": 115414390.0,
      "reward": 0.8751733303070068,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8751733303070068,
      "rewards/reward_func/std": 0.0,
      "step": 4192,
      "step_time": 24.56995451077819
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 148.0625,
      "completions/mean_terminated_length": 148.0625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.4159550294280052,
      "epoch": 0.19421028253821213,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004171000327914953,
      "kl": 0.003203250002115965,
      "learning_rate": 9.611672070402965e-07,
      "loss": 0.0002,
      "num_tokens": 115458919.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4193,
      "step_time": 26.765640523284674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 125.625,
      "completions/mean_terminated_length": 125.625,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.207529217004776,
      "epoch": 0.19425660027790645,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022517722100019455,
      "kl": 0.0016181544633582234,
      "learning_rate": 9.611579434923576e-07,
      "loss": 0.0001,
      "num_tokens": 115478353.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4194,
      "step_time": 15.274928357452154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 141.0,
      "completions/mean_terminated_length": 141.0,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.25281310826539993,
      "epoch": 0.19430291801760075,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034878733567893505,
      "kl": 0.002078938763588667,
      "learning_rate": 9.611486799444185e-07,
      "loss": 0.0001,
      "num_tokens": 115498305.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4195,
      "step_time": 16.672792583703995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 161.9375,
      "completions/mean_terminated_length": 161.9375,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.3926580920815468,
      "epoch": 0.19434923575729504,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006166216917335987,
      "kl": 0.004865115392021835,
      "learning_rate": 9.611394163964799e-07,
      "loss": 0.0002,
      "num_tokens": 115543744.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4196,
      "step_time": 25.46180647611618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 163.625,
      "completions/mean_terminated_length": 163.625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.1641295999288559,
      "epoch": 0.19439555349698934,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12148923426866531,
      "kl": 0.009986362012568861,
      "learning_rate": 9.61130152848541e-07,
      "loss": -0.034,
      "num_tokens": 115577514.0,
      "reward": 0.8585449457168579,
      "reward_std": 0.030507458373904228,
      "rewards/reward_func/mean": 0.8585449457168579,
      "rewards/reward_func/std": 0.030507460236549377,
      "step": 4197,
      "step_time": 22.401244588196278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 143.625,
      "completions/mean_terminated_length": 143.625,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3376666083931923,
      "epoch": 0.19444187123668366,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004313699901103973,
      "kl": 0.0029067276045680046,
      "learning_rate": 9.611208893006021e-07,
      "loss": 0.0001,
      "num_tokens": 115598996.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4198,
      "step_time": 19.078201115131378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 152.4375,
      "completions/mean_terminated_length": 152.4375,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.38116656243801117,
      "epoch": 0.19448818897637796,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005778941325843334,
      "kl": 0.003494909673463553,
      "learning_rate": 9.611116257526632e-07,
      "loss": 0.0002,
      "num_tokens": 115628859.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4199,
      "step_time": 22.87818080559373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 182.9375,
      "completions/mean_terminated_length": 182.9375,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.4122886210680008,
      "epoch": 0.19453450671607225,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0040127490647137165,
      "kl": 0.0033474526717327535,
      "learning_rate": 9.611023622047244e-07,
      "loss": 0.0002,
      "num_tokens": 115654506.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4200,
      "step_time": 23.09018326923251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 136.0,
      "completions/mean_terminated_length": 136.0,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.317509301006794,
      "epoch": 0.19458082445576655,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00948141235858202,
      "kl": 0.005367578472942114,
      "learning_rate": 9.610930986567855e-07,
      "loss": 0.0003,
      "num_tokens": 115690554.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4201,
      "step_time": 20.8521859459579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 137.375,
      "completions/mean_terminated_length": 137.375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3536151200532913,
      "epoch": 0.19462714219546087,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003814997849985957,
      "kl": 0.003215965232811868,
      "learning_rate": 9.610838351088466e-07,
      "loss": 0.0002,
      "num_tokens": 115719008.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4202,
      "step_time": 18.768044739961624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 137.0625,
      "completions/mean_terminated_length": 137.0625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.27821094542741776,
      "epoch": 0.19467345993515517,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003017124952748418,
      "kl": 0.0019369286019355059,
      "learning_rate": 9.610745715609077e-07,
      "loss": 0.0001,
      "num_tokens": 115740801.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4203,
      "step_time": 17.118156362324953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 165.125,
      "completions/mean_terminated_length": 165.125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.4297303408384323,
      "epoch": 0.19471977767484946,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002109951339662075,
      "kl": 0.002258738153614104,
      "learning_rate": 9.610653080129689e-07,
      "loss": 0.0001,
      "num_tokens": 115778179.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4204,
      "step_time": 24.89584843814373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 195.375,
      "completions/mean_terminated_length": 195.375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.2171323448419571,
      "epoch": 0.19476609541454376,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15486817061901093,
      "kl": 0.014342037728056312,
      "learning_rate": 9.6105604446503e-07,
      "loss": -0.0228,
      "num_tokens": 115815929.0,
      "reward": 0.9793950319290161,
      "reward_std": 0.0824199914932251,
      "rewards/reward_func/mean": 0.9793950319290161,
      "rewards/reward_func/std": 0.0824199914932251,
      "step": 4205,
      "step_time": 25.921521224081516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 159.875,
      "completions/mean_terminated_length": 159.875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.28121767938137054,
      "epoch": 0.19481241315423808,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16210217773914337,
      "kl": 0.03833826305344701,
      "learning_rate": 9.610467809170913e-07,
      "loss": 0.0126,
      "num_tokens": 115836679.0,
      "reward": 0.9559363126754761,
      "reward_std": 0.058751560747623444,
      "rewards/reward_func/mean": 0.9559363126754761,
      "rewards/reward_func/std": 0.058751557022333145,
      "step": 4206,
      "step_time": 19.4800363779068
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 164.75,
      "completions/mean_terminated_length": 164.75,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.22877272963523865,
      "epoch": 0.19485873089393238,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13253124058246613,
      "kl": 0.005843870399985462,
      "learning_rate": 9.610375173691524e-07,
      "loss": -0.0154,
      "num_tokens": 115858179.0,
      "reward": 0.9439884424209595,
      "reward_std": 0.027789480984210968,
      "rewards/reward_func/mean": 0.9439884424209595,
      "rewards/reward_func/std": 0.027789490297436714,
      "step": 4207,
      "step_time": 18.124618284404278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 147.875,
      "completions/mean_terminated_length": 147.875,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.12721232697367668,
      "epoch": 0.19490504863362668,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015336594078689814,
      "kl": 0.0009773392375791445,
      "learning_rate": 9.610282538212134e-07,
      "loss": 0.0,
      "num_tokens": 115892945.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 4208,
      "step_time": 20.8048697412014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 149.5,
      "completions/mean_terminated_length": 149.5,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.39501602947711945,
      "epoch": 0.19495136637332097,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002226000651717186,
      "kl": 0.0021614611614495516,
      "learning_rate": 9.610189902732747e-07,
      "loss": 0.0001,
      "num_tokens": 115925897.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4209,
      "step_time": 20.158426381647587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 431.0,
      "completions/max_terminated_length": 431.0,
      "completions/mean_length": 394.125,
      "completions/mean_terminated_length": 394.125,
      "completions/min_length": 352.0,
      "completions/min_terminated_length": 352.0,
      "entropy": 0.2638545520603657,
      "epoch": 0.1949976841130153,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08238273113965988,
      "kl": 0.02052580565214157,
      "learning_rate": 9.610097267253358e-07,
      "loss": -0.0409,
      "num_tokens": 115961819.0,
      "reward": 0.899699330329895,
      "reward_std": 0.10436239093542099,
      "rewards/reward_func/mean": 0.899699330329895,
      "rewards/reward_func/std": 0.10436239093542099,
      "step": 4210,
      "step_time": 43.25924604386091
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 152.75,
      "completions/mean_terminated_length": 152.75,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3825525864958763,
      "epoch": 0.1950440018527096,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006205675192177296,
      "kl": 0.004654404416214675,
      "learning_rate": 9.61000463177397e-07,
      "loss": 0.0002,
      "num_tokens": 115985607.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4211,
      "step_time": 18.63008676469326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 159.6875,
      "completions/mean_terminated_length": 159.6875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.155677430331707,
      "epoch": 0.1950903195924039,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002837250242009759,
      "kl": 0.09240571223199368,
      "learning_rate": 9.60991199629458e-07,
      "loss": 0.0046,
      "num_tokens": 116010354.0,
      "reward": 0.910879909992218,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.910879909992218,
      "rewards/reward_func/std": 0.0,
      "step": 4212,
      "step_time": 19.852250806987286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 287.0,
      "completions/max_terminated_length": 287.0,
      "completions/mean_length": 210.0625,
      "completions/mean_terminated_length": 210.0625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.42255423218011856,
      "epoch": 0.19513663733209818,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005290188826620579,
      "kl": 0.00481342279817909,
      "learning_rate": 9.609819360815192e-07,
      "loss": 0.0002,
      "num_tokens": 116036403.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4213,
      "step_time": 32.70421166345477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 174.3125,
      "completions/mean_terminated_length": 174.3125,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.41341114789247513,
      "epoch": 0.1951829550717925,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011044755578041077,
      "kl": 0.0070021499413996935,
      "learning_rate": 9.609726725335803e-07,
      "loss": 0.0003,
      "num_tokens": 116079592.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4214,
      "step_time": 26.656338464468718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 142.4375,
      "completions/mean_terminated_length": 142.4375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.2703259363770485,
      "epoch": 0.1952292728114868,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005582020152360201,
      "kl": 0.003675670886877924,
      "learning_rate": 9.609634089856414e-07,
      "loss": 0.0002,
      "num_tokens": 116099663.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4215,
      "step_time": 16.943227514624596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 159.5,
      "completions/mean_terminated_length": 159.5,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.42376527935266495,
      "epoch": 0.1952755905511811,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023706508800387383,
      "kl": 0.002180389012210071,
      "learning_rate": 9.609541454377026e-07,
      "loss": 0.0001,
      "num_tokens": 116133031.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4216,
      "step_time": 21.929262027144432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 140.0,
      "completions/mean_terminated_length": 140.0,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3579980581998825,
      "epoch": 0.1953219082908754,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021299226209521294,
      "kl": 0.0022214812925085425,
      "learning_rate": 9.609448818897637e-07,
      "loss": 0.0001,
      "num_tokens": 116163735.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4217,
      "step_time": 18.779218014329672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 176.5,
      "completions/mean_terminated_length": 176.5,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.2582762949168682,
      "epoch": 0.19536822603056972,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15609025955200195,
      "kl": 0.006726448307745159,
      "learning_rate": 9.609356183418248e-07,
      "loss": -0.0004,
      "num_tokens": 116186847.0,
      "reward": 0.994957685470581,
      "reward_std": 0.020169313997030258,
      "rewards/reward_func/mean": 0.994957685470581,
      "rewards/reward_func/std": 0.020169317722320557,
      "step": 4218,
      "step_time": 20.52297157049179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 176.5,
      "completions/mean_terminated_length": 176.5,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.3304376155138016,
      "epoch": 0.19541454377026402,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1558787077665329,
      "kl": 0.01612040540203452,
      "learning_rate": 9.609263547938862e-07,
      "loss": -0.1094,
      "num_tokens": 116210231.0,
      "reward": 0.1659776270389557,
      "reward_std": 0.35684120655059814,
      "rewards/reward_func/mean": 0.1659776270389557,
      "rewards/reward_func/std": 0.35684117674827576,
      "step": 4219,
      "step_time": 22.43446246162057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 125.625,
      "completions/mean_terminated_length": 125.625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2898036688566208,
      "epoch": 0.1954608615099583,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023353155702352524,
      "kl": 0.0018903164600487798,
      "learning_rate": 9.60917091245947e-07,
      "loss": 0.0001,
      "num_tokens": 116236017.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4220,
      "step_time": 16.95613558217883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 190.3125,
      "completions/mean_terminated_length": 190.3125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.27350638061761856,
      "epoch": 0.1955071792496526,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1756594330072403,
      "kl": 0.021484515629708767,
      "learning_rate": 9.609078276980082e-07,
      "loss": 0.0274,
      "num_tokens": 116257894.0,
      "reward": 0.9054722785949707,
      "reward_std": 0.09762780368328094,
      "rewards/reward_func/mean": 0.9054722785949707,
      "rewards/reward_func/std": 0.09762781113386154,
      "step": 4221,
      "step_time": 24.026976376771927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 151.75,
      "completions/mean_terminated_length": 151.75,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.2922987937927246,
      "epoch": 0.19555349698934693,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031239285599440336,
      "kl": 0.002129494328983128,
      "learning_rate": 9.608985641500695e-07,
      "loss": 0.0001,
      "num_tokens": 116278498.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4222,
      "step_time": 18.22118454799056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 181.3125,
      "completions/mean_terminated_length": 181.3125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.4489883482456207,
      "epoch": 0.19559981472904123,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037734820507466793,
      "kl": 0.0030742106027901173,
      "learning_rate": 9.608893006021307e-07,
      "loss": 0.0002,
      "num_tokens": 116310247.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4223,
      "step_time": 22.699246127158403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 206.25,
      "completions/mean_terminated_length": 206.25,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.39858105778694153,
      "epoch": 0.19564613246873552,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007778745610266924,
      "kl": 0.006325821857899427,
      "learning_rate": 9.608800370541918e-07,
      "loss": 0.0003,
      "num_tokens": 116342379.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4224,
      "step_time": 29.138686653226614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 148.375,
      "completions/mean_terminated_length": 148.375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.22915543615818024,
      "epoch": 0.19569245020842982,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035018566995859146,
      "kl": 0.0028540154453366995,
      "learning_rate": 9.60870773506253e-07,
      "loss": 0.0001,
      "num_tokens": 116362977.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4225,
      "step_time": 17.080576337873936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 190.75,
      "completions/mean_terminated_length": 190.75,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.42701394855976105,
      "epoch": 0.19573876794812414,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0068210274912416935,
      "kl": 0.00447841000277549,
      "learning_rate": 9.60861509958314e-07,
      "loss": 0.0002,
      "num_tokens": 116384749.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4226,
      "step_time": 23.173054572194815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 203.8125,
      "completions/mean_terminated_length": 203.8125,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.2344268225133419,
      "epoch": 0.19578508568781844,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021351028699427843,
      "kl": 0.035033333115279675,
      "learning_rate": 9.608522464103752e-07,
      "loss": 0.0017,
      "num_tokens": 116416714.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4227,
      "step_time": 25.170080687850714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 377.0,
      "completions/max_terminated_length": 377.0,
      "completions/mean_length": 337.75,
      "completions/mean_terminated_length": 337.75,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "entropy": 0.17622502893209457,
      "epoch": 0.19583140342751273,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005085702054202557,
      "kl": 0.004294879618100822,
      "learning_rate": 9.608429828624363e-07,
      "loss": 0.0002,
      "num_tokens": 116442758.0,
      "reward": 0.7532761096954346,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7532761096954346,
      "rewards/reward_func/std": 0.0,
      "step": 4228,
      "step_time": 36.40155283361673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 132.125,
      "completions/mean_terminated_length": 132.125,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.294950395822525,
      "epoch": 0.19587772116720703,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016333815874531865,
      "kl": 0.0014642233145423234,
      "learning_rate": 9.608337193144974e-07,
      "loss": 0.0001,
      "num_tokens": 116468312.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4229,
      "step_time": 18.435693010687828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 185.625,
      "completions/mean_terminated_length": 185.625,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.19461952149868011,
      "epoch": 0.19592403890690135,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005276711191982031,
      "kl": 0.0035938590299338102,
      "learning_rate": 9.608244557665585e-07,
      "loss": 0.0002,
      "num_tokens": 116489874.0,
      "reward": 0.780767560005188,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.780767560005188,
      "rewards/reward_func/std": 0.0,
      "step": 4230,
      "step_time": 21.15241439640522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 157.6875,
      "completions/mean_terminated_length": 157.6875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.43789688497781754,
      "epoch": 0.19597035664659565,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024460810236632824,
      "kl": 0.0021170018007978797,
      "learning_rate": 9.608151922186197e-07,
      "loss": 0.0001,
      "num_tokens": 116528797.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4231,
      "step_time": 25.725592702627182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 208.3125,
      "completions/mean_terminated_length": 208.3125,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.2611159197986126,
      "epoch": 0.19601667438628995,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08797170966863632,
      "kl": 0.004341409425251186,
      "learning_rate": 9.608059286706808e-07,
      "loss": 0.0201,
      "num_tokens": 116550722.0,
      "reward": 0.9966088533401489,
      "reward_std": 0.013564594089984894,
      "rewards/reward_func/mean": 0.9966088533401489,
      "rewards/reward_func/std": 0.013564602471888065,
      "step": 4232,
      "step_time": 24.211438823491335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 172.875,
      "completions/mean_terminated_length": 172.875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.20028812438249588,
      "epoch": 0.19606299212598424,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17509032785892487,
      "kl": 0.019681470876093954,
      "learning_rate": 9.60796665122742e-07,
      "loss": -0.0895,
      "num_tokens": 116575280.0,
      "reward": 0.4348171353340149,
      "reward_std": 0.27660179138183594,
      "rewards/reward_func/mean": 0.4348171353340149,
      "rewards/reward_func/std": 0.2766018211841583,
      "step": 4233,
      "step_time": 22.409042045474052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 165.3125,
      "completions/mean_terminated_length": 165.3125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.16007276996970177,
      "epoch": 0.19610930986567857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10615488141775131,
      "kl": 0.001744678505929187,
      "learning_rate": 9.60787401574803e-07,
      "loss": -0.0777,
      "num_tokens": 116626245.0,
      "reward": 0.8260819911956787,
      "reward_std": 0.036492254585027695,
      "rewards/reward_func/mean": 0.8260819911956787,
      "rewards/reward_func/std": 0.03649226203560829,
      "step": 4234,
      "step_time": 26.96291321888566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 192.125,
      "completions/mean_terminated_length": 192.125,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.35143470764160156,
      "epoch": 0.19615562760537286,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002009526826441288,
      "kl": 0.0020950931939296424,
      "learning_rate": 9.607781380268642e-07,
      "loss": 0.0001,
      "num_tokens": 116655479.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4235,
      "step_time": 22.444360587745905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 181.0625,
      "completions/mean_terminated_length": 181.0625,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.36605076491832733,
      "epoch": 0.19620194534506716,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004338215105235577,
      "kl": 0.003199953935109079,
      "learning_rate": 9.607688744789255e-07,
      "loss": 0.0002,
      "num_tokens": 116697496.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4236,
      "step_time": 25.907401349395514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 120.875,
      "completions/mean_terminated_length": 120.875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2512933015823364,
      "epoch": 0.19624826308476145,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029526492580771446,
      "kl": 0.0016395335551351309,
      "learning_rate": 9.607596109309866e-07,
      "loss": 0.0001,
      "num_tokens": 116717126.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4237,
      "step_time": 16.43812559172511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 168.1875,
      "completions/mean_terminated_length": 168.1875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.1612314097583294,
      "epoch": 0.19629458082445578,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0048973532393574715,
      "kl": 0.0034369818749837577,
      "learning_rate": 9.607503473830475e-07,
      "loss": 0.0002,
      "num_tokens": 116740425.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 4238,
      "step_time": 19.980030063539743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 162.5,
      "completions/mean_terminated_length": 162.5,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.2984234616160393,
      "epoch": 0.19634089856415007,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027296452317386866,
      "kl": 0.002165403391700238,
      "learning_rate": 9.607410838351089e-07,
      "loss": 0.0001,
      "num_tokens": 116762369.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4239,
      "step_time": 19.539735689759254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 188.3125,
      "completions/mean_terminated_length": 188.3125,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.4104792848229408,
      "epoch": 0.19638721630384437,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026210760697722435,
      "kl": 0.0024948460049927235,
      "learning_rate": 9.6073182028717e-07,
      "loss": 0.0001,
      "num_tokens": 116793350.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4240,
      "step_time": 25.698275484144688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 138.4375,
      "completions/mean_terminated_length": 138.4375,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2984291911125183,
      "epoch": 0.19643353404353867,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014600768918171525,
      "kl": 0.0014500562101602554,
      "learning_rate": 9.607225567392311e-07,
      "loss": 0.0001,
      "num_tokens": 116815821.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4241,
      "step_time": 19.075826909393072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 172.625,
      "completions/mean_terminated_length": 172.625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.4563741758465767,
      "epoch": 0.196479851783233,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007138571701943874,
      "kl": 0.0027323447866365314,
      "learning_rate": 9.607132931912922e-07,
      "loss": 0.0001,
      "num_tokens": 116863351.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4242,
      "step_time": 31.27183734625578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 267.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 191.75,
      "completions/mean_terminated_length": 191.75,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.47689762711524963,
      "epoch": 0.19652616952292729,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004615445155650377,
      "kl": 0.0036913889925926924,
      "learning_rate": 9.607040296433534e-07,
      "loss": 0.0002,
      "num_tokens": 116903811.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4243,
      "step_time": 29.8212310038507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 153.875,
      "completions/mean_terminated_length": 153.875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.22932784631848335,
      "epoch": 0.19657248726262158,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037516478914767504,
      "kl": 0.003196165431290865,
      "learning_rate": 9.606947660954145e-07,
      "loss": 0.0002,
      "num_tokens": 116924097.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4244,
      "step_time": 18.255730766803026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 121.8125,
      "completions/mean_terminated_length": 121.8125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.26331768184900284,
      "epoch": 0.19661880500231588,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004044357221573591,
      "kl": 0.002656788448803127,
      "learning_rate": 9.606855025474756e-07,
      "loss": 0.0001,
      "num_tokens": 116944446.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4245,
      "step_time": 15.24508398398757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 168.6875,
      "completions/mean_terminated_length": 168.6875,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.4032922387123108,
      "epoch": 0.1966651227420102,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003165687434375286,
      "kl": 0.0030372024630196393,
      "learning_rate": 9.606762389995367e-07,
      "loss": 0.0002,
      "num_tokens": 117003001.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4246,
      "step_time": 33.07563906535506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 198.75,
      "completions/mean_terminated_length": 198.75,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.24546974152326584,
      "epoch": 0.1967114404817045,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10324181616306305,
      "kl": 0.020861016120761633,
      "learning_rate": 9.606669754515979e-07,
      "loss": 0.0337,
      "num_tokens": 117024453.0,
      "reward": 0.6265100240707397,
      "reward_std": 0.00829548854380846,
      "rewards/reward_func/mean": 0.6265100240707397,
      "rewards/reward_func/std": 0.008295491337776184,
      "step": 4247,
      "step_time": 22.54040576890111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 342.0,
      "completions/max_terminated_length": 342.0,
      "completions/mean_length": 228.3125,
      "completions/mean_terminated_length": 228.3125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.5325218811631203,
      "epoch": 0.1967577582213988,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12794294953346252,
      "kl": 0.007044766913168132,
      "learning_rate": 9.60657711903659e-07,
      "loss": 0.2161,
      "num_tokens": 117053866.0,
      "reward": 0.5625,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.5625,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 4248,
      "step_time": 32.69337200373411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 152.0625,
      "completions/mean_terminated_length": 152.0625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.40886230766773224,
      "epoch": 0.1968040759610931,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006556299049407244,
      "kl": 0.0021459850249812007,
      "learning_rate": 9.606484483557203e-07,
      "loss": 0.0001,
      "num_tokens": 117086427.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4249,
      "step_time": 21.64050517976284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 127.4375,
      "completions/mean_terminated_length": 127.4375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3320283144712448,
      "epoch": 0.1968503937007874,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021526897326111794,
      "kl": 0.0018389831820968539,
      "learning_rate": 9.606391848077814e-07,
      "loss": 0.0001,
      "num_tokens": 117115074.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4250,
      "step_time": 19.892197255045176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 360.0,
      "completions/max_terminated_length": 360.0,
      "completions/mean_length": 232.6875,
      "completions/mean_terminated_length": 232.6875,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.39874593913555145,
      "epoch": 0.1968967114404817,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10539249330759048,
      "kl": 0.004786168457940221,
      "learning_rate": 9.606299212598424e-07,
      "loss": 0.1641,
      "num_tokens": 117159405.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 4251,
      "step_time": 38.98399826139212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 212.0625,
      "completions/mean_terminated_length": 212.0625,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.28354430943727493,
      "epoch": 0.196943029180176,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16920839250087738,
      "kl": 0.013166029239073396,
      "learning_rate": 9.606206577119035e-07,
      "loss": -0.1126,
      "num_tokens": 117189742.0,
      "reward": 0.40036869049072266,
      "reward_std": 0.41349899768829346,
      "rewards/reward_func/mean": 0.40036869049072266,
      "rewards/reward_func/std": 0.41349902749061584,
      "step": 4252,
      "step_time": 28.75827705487609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 168.9375,
      "completions/mean_terminated_length": 168.9375,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.38469357043504715,
      "epoch": 0.1969893469198703,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.016849059611558914,
      "kl": 0.008995268843136728,
      "learning_rate": 9.606113941639648e-07,
      "loss": 0.0005,
      "num_tokens": 117226493.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4253,
      "step_time": 25.00868810713291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 193.8125,
      "completions/mean_terminated_length": 193.8125,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.2701209485530853,
      "epoch": 0.19703566465956462,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15225957334041595,
      "kl": 0.015052320901304483,
      "learning_rate": 9.60602130616026e-07,
      "loss": 0.0469,
      "num_tokens": 117250234.0,
      "reward": 0.4334152936935425,
      "reward_std": 0.1670493483543396,
      "rewards/reward_func/mean": 0.4334152936935425,
      "rewards/reward_func/std": 0.1670493334531784,
      "step": 4254,
      "step_time": 26.83351392298937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 191.875,
      "completions/mean_terminated_length": 191.875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.3544131889939308,
      "epoch": 0.19708198239925892,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018783751875162125,
      "kl": 0.0022542255464941263,
      "learning_rate": 9.60592867068087e-07,
      "loss": 0.0001,
      "num_tokens": 117283512.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4255,
      "step_time": 24.932211596518755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 263.0,
      "completions/max_terminated_length": 263.0,
      "completions/mean_length": 211.8125,
      "completions/mean_terminated_length": 211.8125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.4790257215499878,
      "epoch": 0.19712830013895322,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10050687938928604,
      "kl": 0.008070754003711045,
      "learning_rate": 9.605836035201482e-07,
      "loss": 0.0672,
      "num_tokens": 117305877.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 4256,
      "step_time": 25.3606780692935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 172.3125,
      "completions/mean_terminated_length": 172.3125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.40603653341531754,
      "epoch": 0.1971746178786475,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1433793604373932,
      "kl": 0.009749911143444479,
      "learning_rate": 9.605743399722093e-07,
      "loss": 0.0347,
      "num_tokens": 117326330.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 4257,
      "step_time": 23.288886569440365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 165.125,
      "completions/mean_terminated_length": 165.125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.19553682953119278,
      "epoch": 0.19722093561834184,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002261777874082327,
      "kl": 0.0021992510301060975,
      "learning_rate": 9.605650764242704e-07,
      "loss": 0.0001,
      "num_tokens": 117351276.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4258,
      "step_time": 21.188062090426683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 182.625,
      "completions/mean_terminated_length": 182.625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.24807263538241386,
      "epoch": 0.19726725335803613,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14250698685646057,
      "kl": 0.029690278228372335,
      "learning_rate": 9.605558128763316e-07,
      "loss": -0.0069,
      "num_tokens": 117372566.0,
      "reward": 0.6371071338653564,
      "reward_std": 0.03512675315141678,
      "rewards/reward_func/mean": 0.6371071338653564,
      "rewards/reward_func/std": 0.03512675687670708,
      "step": 4259,
      "step_time": 20.61169083416462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 178.375,
      "completions/mean_terminated_length": 178.375,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.20011616870760918,
      "epoch": 0.19731357109773043,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14215035736560822,
      "kl": 0.07767318189144135,
      "learning_rate": 9.605465493283927e-07,
      "loss": 0.0035,
      "num_tokens": 117393612.0,
      "reward": 0.752162516117096,
      "reward_std": 0.2807443141937256,
      "rewards/reward_func/mean": 0.752162516117096,
      "rewards/reward_func/std": 0.280744343996048,
      "step": 4260,
      "step_time": 19.663628932088614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 133.375,
      "completions/mean_terminated_length": 133.375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2357555367052555,
      "epoch": 0.19735988883742472,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012321810238063335,
      "kl": 0.005285892868414521,
      "learning_rate": 9.605372857804538e-07,
      "loss": 0.0003,
      "num_tokens": 117413778.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4261,
      "step_time": 18.2445274181664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 173.125,
      "completions/mean_terminated_length": 173.125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.24125902354717255,
      "epoch": 0.19740620657711905,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004134069662541151,
      "kl": 0.04424409521743655,
      "learning_rate": 9.605280222325152e-07,
      "loss": 0.0022,
      "num_tokens": 117436196.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4262,
      "step_time": 20.667074255645275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 142.0,
      "completions/mean_terminated_length": 142.0,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.3567390739917755,
      "epoch": 0.19745252431681334,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002030797302722931,
      "kl": 0.0020479909144341946,
      "learning_rate": 9.60518758684576e-07,
      "loss": 0.0001,
      "num_tokens": 117460948.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4263,
      "step_time": 18.66989303380251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 133.5,
      "completions/mean_terminated_length": 133.5,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.27444790303707123,
      "epoch": 0.19749884205650764,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030250735580921173,
      "kl": 0.002313581178896129,
      "learning_rate": 9.605094951366372e-07,
      "loss": 0.0001,
      "num_tokens": 117482748.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4264,
      "step_time": 17.1928793489933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 131.625,
      "completions/mean_terminated_length": 131.625,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.34081917256116867,
      "epoch": 0.19754515979620194,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003179185790941119,
      "kl": 0.0027597531443461776,
      "learning_rate": 9.605002315886983e-07,
      "loss": 0.0001,
      "num_tokens": 117504374.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4265,
      "step_time": 16.26287868246436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 173.5,
      "completions/mean_terminated_length": 173.5,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.20281190425157547,
      "epoch": 0.19759147753589626,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012527067214250565,
      "kl": 0.019857921521179378,
      "learning_rate": 9.604909680407597e-07,
      "loss": 0.0009,
      "num_tokens": 117536926.0,
      "reward": 0.10889280587434769,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.10889280587434769,
      "rewards/reward_func/std": 0.0,
      "step": 4266,
      "step_time": 23.154936235398054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 144.8125,
      "completions/mean_terminated_length": 144.8125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.32744892686605453,
      "epoch": 0.19763779527559056,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013877227902412415,
      "kl": 0.007512084790505469,
      "learning_rate": 9.604817044928208e-07,
      "loss": 0.0004,
      "num_tokens": 117560475.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4267,
      "step_time": 19.869691032916307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 125.4375,
      "completions/mean_terminated_length": 125.4375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2384634166955948,
      "epoch": 0.19768411301528485,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032265728805214167,
      "kl": 0.0018207980028819293,
      "learning_rate": 9.60472440944882e-07,
      "loss": 0.0001,
      "num_tokens": 117580322.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4268,
      "step_time": 15.346847128123045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 149.1875,
      "completions/mean_terminated_length": 149.1875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3133619949221611,
      "epoch": 0.19773043075497915,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008078614249825478,
      "kl": 0.004485111043322831,
      "learning_rate": 9.60463177396943e-07,
      "loss": 0.0002,
      "num_tokens": 117602469.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4269,
      "step_time": 18.841124154627323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 120.3125,
      "completions/mean_terminated_length": 120.3125,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.30543189495801926,
      "epoch": 0.19777674849467347,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002928100759163499,
      "kl": 0.0022029224201105535,
      "learning_rate": 9.604539138490042e-07,
      "loss": 0.0001,
      "num_tokens": 117637562.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4270,
      "step_time": 18.047686591744423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 120.6875,
      "completions/mean_terminated_length": 120.6875,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.27932991087436676,
      "epoch": 0.19782306623436777,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005019314121454954,
      "kl": 0.0028968447586521506,
      "learning_rate": 9.604446503010653e-07,
      "loss": 0.0001,
      "num_tokens": 117658005.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4271,
      "step_time": 14.7598297894001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 178.25,
      "completions/mean_terminated_length": 178.25,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.35574671626091003,
      "epoch": 0.19786938397406206,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18242698907852173,
      "kl": 0.018063213676214218,
      "learning_rate": 9.604353867531264e-07,
      "loss": -0.0296,
      "num_tokens": 117681561.0,
      "reward": 0.5846918821334839,
      "reward_std": 0.46775349974632263,
      "rewards/reward_func/mean": 0.5846918821334839,
      "rewards/reward_func/std": 0.46775349974632263,
      "step": 4272,
      "step_time": 22.201333187520504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 124.0,
      "completions/max_terminated_length": 124.0,
      "completions/mean_length": 104.5625,
      "completions/mean_terminated_length": 104.5625,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "entropy": 0.2888680547475815,
      "epoch": 0.19791570171375636,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035976667422801256,
      "kl": 0.0022441393230110407,
      "learning_rate": 9.604261232051875e-07,
      "loss": 0.0001,
      "num_tokens": 117704594.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4273,
      "step_time": 14.309563618153334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 157.6875,
      "completions/mean_terminated_length": 157.6875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.1722608432173729,
      "epoch": 0.19796201945345068,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21106410026550293,
      "kl": 0.05202815495431423,
      "learning_rate": 9.604168596572487e-07,
      "loss": -0.0167,
      "num_tokens": 117732525.0,
      "reward": 0.9616204500198364,
      "reward_std": 0.06865544617176056,
      "rewards/reward_func/mean": 0.9616204500198364,
      "rewards/reward_func/std": 0.06865545362234116,
      "step": 4274,
      "step_time": 21.714784532785416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 255.0,
      "completions/max_terminated_length": 255.0,
      "completions/mean_length": 234.3125,
      "completions/mean_terminated_length": 234.3125,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "entropy": 0.16529910638928413,
      "epoch": 0.19800833719314498,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026992466300725937,
      "kl": 0.002464011573465541,
      "learning_rate": 9.604075961093098e-07,
      "loss": 0.0001,
      "num_tokens": 117770098.0,
      "reward": 0.5576546788215637,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5576546788215637,
      "rewards/reward_func/std": 0.0,
      "step": 4275,
      "step_time": 29.23502964526415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 160.9375,
      "completions/mean_terminated_length": 160.9375,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.28430066257715225,
      "epoch": 0.19805465493283927,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003889898071065545,
      "kl": 0.0031116321333684027,
      "learning_rate": 9.60398332561371e-07,
      "loss": 0.0002,
      "num_tokens": 117792001.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4276,
      "step_time": 21.740156807005405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 146.3125,
      "completions/mean_terminated_length": 146.3125,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.25032076239585876,
      "epoch": 0.19810097267253357,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17101536691188812,
      "kl": 0.006241139606572688,
      "learning_rate": 9.60389069013432e-07,
      "loss": -0.1195,
      "num_tokens": 117814230.0,
      "reward": 0.1875,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.1875,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 4277,
      "step_time": 22.160698179155588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 183.6875,
      "completions/mean_terminated_length": 183.6875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.4354545697569847,
      "epoch": 0.1981472904122279,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.02668669819831848,
      "kl": 0.01329098385758698,
      "learning_rate": 9.603798054654932e-07,
      "loss": 0.0007,
      "num_tokens": 117840369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4278,
      "step_time": 25.36359355598688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 190.0625,
      "completions/mean_terminated_length": 190.0625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.3894534707069397,
      "epoch": 0.1981936081519222,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11786775290966034,
      "kl": 0.007994166575372219,
      "learning_rate": 9.603705419175545e-07,
      "loss": 0.0213,
      "num_tokens": 117863106.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4279,
      "step_time": 23.085601836442947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 125.9375,
      "completions/mean_terminated_length": 125.9375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2934921160340309,
      "epoch": 0.1982399258916165,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019177637295797467,
      "kl": 0.0017948686436284333,
      "learning_rate": 9.603612783696156e-07,
      "loss": 0.0001,
      "num_tokens": 117884369.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4280,
      "step_time": 15.263262741267681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 157.125,
      "completions/mean_terminated_length": 157.125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.1332587655633688,
      "epoch": 0.19828624363131078,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25884559750556946,
      "kl": 0.036092991940677166,
      "learning_rate": 9.603520148216765e-07,
      "loss": -0.0136,
      "num_tokens": 117904963.0,
      "reward": 0.9548678994178772,
      "reward_std": 0.12332441657781601,
      "rewards/reward_func/mean": 0.9548678994178772,
      "rewards/reward_func/std": 0.1233244240283966,
      "step": 4281,
      "step_time": 17.021088305860758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 201.5,
      "completions/mean_terminated_length": 201.5,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.27391989156603813,
      "epoch": 0.1983325613710051,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11594119668006897,
      "kl": 0.014407145557925105,
      "learning_rate": 9.603427512737377e-07,
      "loss": 0.0807,
      "num_tokens": 117935355.0,
      "reward": 0.8145408630371094,
      "reward_std": 0.24217469990253448,
      "rewards/reward_func/mean": 0.8145408630371094,
      "rewards/reward_func/std": 0.24217469990253448,
      "step": 4282,
      "step_time": 29.77666798233986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 114.8125,
      "completions/mean_terminated_length": 114.8125,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.3361153304576874,
      "epoch": 0.1983788791106994,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00720619922503829,
      "kl": 0.003871683613397181,
      "learning_rate": 9.60333487725799e-07,
      "loss": 0.0002,
      "num_tokens": 117959832.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4283,
      "step_time": 15.830325540155172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 142.75,
      "completions/mean_terminated_length": 142.75,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.17089350149035454,
      "epoch": 0.1984251968503937,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19309671223163605,
      "kl": 0.005295194743666798,
      "learning_rate": 9.603242241778601e-07,
      "loss": -0.0036,
      "num_tokens": 117980164.0,
      "reward": 0.9444707632064819,
      "reward_std": 0.03311120346188545,
      "rewards/reward_func/mean": 0.9444707632064819,
      "rewards/reward_func/std": 0.03311121463775635,
      "step": 4284,
      "step_time": 17.01628965139389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 179.3125,
      "completions/mean_terminated_length": 179.3125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.19935547560453415,
      "epoch": 0.198471514590088,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1068725511431694,
      "kl": 0.004089641530299559,
      "learning_rate": 9.603149606299212e-07,
      "loss": -0.0525,
      "num_tokens": 118008345.0,
      "reward": 0.8930783271789551,
      "reward_std": 0.041308917105197906,
      "rewards/reward_func/mean": 0.8930783271789551,
      "rewards/reward_func/std": 0.04130890965461731,
      "step": 4285,
      "step_time": 22.150147527456284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 187.875,
      "completions/mean_terminated_length": 187.875,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.3802003040909767,
      "epoch": 0.19851783232978232,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004996567498892546,
      "kl": 0.004184939316473901,
      "learning_rate": 9.603056970819824e-07,
      "loss": 0.0002,
      "num_tokens": 118042711.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4286,
      "step_time": 27.316251508891582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 150.75,
      "completions/mean_terminated_length": 150.75,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.22020679339766502,
      "epoch": 0.1985641500694766,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005396071821451187,
      "kl": 0.003406070638448,
      "learning_rate": 9.602964335340435e-07,
      "loss": 0.0002,
      "num_tokens": 118066323.0,
      "reward": 0.8307302594184875,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8307302594184875,
      "rewards/reward_func/std": 0.0,
      "step": 4287,
      "step_time": 19.11954002082348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 137.375,
      "completions/mean_terminated_length": 137.375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.39390766620635986,
      "epoch": 0.1986104678091709,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003806658321991563,
      "kl": 0.0030758902430534363,
      "learning_rate": 9.602871699861046e-07,
      "loss": 0.0002,
      "num_tokens": 118088249.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4288,
      "step_time": 19.577602609992027
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 171.0,
      "completions/mean_terminated_length": 171.0,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.13845118507742882,
      "epoch": 0.1986567855488652,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004882005508989096,
      "kl": 0.0030589258822146803,
      "learning_rate": 9.602779064381657e-07,
      "loss": 0.0002,
      "num_tokens": 118110521.0,
      "reward": 0.73319411277771,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.73319411277771,
      "rewards/reward_func/std": 0.0,
      "step": 4289,
      "step_time": 22.829368107020855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 140.625,
      "completions/mean_terminated_length": 140.625,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.42518218606710434,
      "epoch": 0.19870310328855953,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003994393162429333,
      "kl": 0.002797422173898667,
      "learning_rate": 9.602686428902269e-07,
      "loss": 0.0001,
      "num_tokens": 118133203.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4290,
      "step_time": 18.251724045723677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 174.6875,
      "completions/mean_terminated_length": 174.6875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.2062804326415062,
      "epoch": 0.19874942102825383,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008033553138375282,
      "kl": 0.006925027701072395,
      "learning_rate": 9.60259379342288e-07,
      "loss": 0.0003,
      "num_tokens": 118154910.0,
      "reward": 0.939104437828064,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.939104437828064,
      "rewards/reward_func/std": 0.0,
      "step": 4291,
      "step_time": 20.61775129288435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 437.0,
      "completions/max_terminated_length": 437.0,
      "completions/mean_length": 233.75,
      "completions/mean_terminated_length": 233.75,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.5162595435976982,
      "epoch": 0.19879573876794812,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11483185738325119,
      "kl": 0.00691199628636241,
      "learning_rate": 9.602501157943493e-07,
      "loss": 0.2097,
      "num_tokens": 118179930.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 4292,
      "step_time": 39.865657422691584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 177.5625,
      "completions/mean_terminated_length": 177.5625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.33295440673828125,
      "epoch": 0.19884205650764242,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012061888352036476,
      "kl": 0.01105513609945774,
      "learning_rate": 9.602408522464105e-07,
      "loss": 0.0006,
      "num_tokens": 118200195.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4293,
      "step_time": 22.23841020464897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 140.0,
      "completions/mean_terminated_length": 140.0,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.19111380353569984,
      "epoch": 0.19888837424733674,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004515485372394323,
      "kl": 0.0043804405722767115,
      "learning_rate": 9.602315886984714e-07,
      "loss": 0.0002,
      "num_tokens": 118233011.0,
      "reward": 0.7177659273147583,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7177659273147583,
      "rewards/reward_func/std": 0.0,
      "step": 4294,
      "step_time": 18.93245917931199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 177.5625,
      "completions/mean_terminated_length": 177.5625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.35970911383628845,
      "epoch": 0.19893469198703104,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026350398547947407,
      "kl": 0.0023667019268032163,
      "learning_rate": 9.602223251505325e-07,
      "loss": 0.0001,
      "num_tokens": 118262076.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4295,
      "step_time": 27.931419048458338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 192.375,
      "completions/mean_terminated_length": 192.375,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.24056771770119667,
      "epoch": 0.19898100972672533,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006367878057062626,
      "kl": 0.005521641462109983,
      "learning_rate": 9.602130616025938e-07,
      "loss": 0.0003,
      "num_tokens": 118286562.0,
      "reward": 0.558035135269165,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.558035135269165,
      "rewards/reward_func/std": 0.0,
      "step": 4296,
      "step_time": 21.315611243247986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 148.3125,
      "completions/mean_terminated_length": 148.3125,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.261702336370945,
      "epoch": 0.19902732746641963,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036970695946365595,
      "kl": 0.0020516463846433908,
      "learning_rate": 9.60203798054655e-07,
      "loss": 0.0001,
      "num_tokens": 118310279.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4297,
      "step_time": 20.674363385885954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 177.375,
      "completions/mean_terminated_length": 177.375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4102034866809845,
      "epoch": 0.19907364520611395,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024717217311263084,
      "kl": 0.002613342134281993,
      "learning_rate": 9.60194534506716e-07,
      "loss": 0.0001,
      "num_tokens": 118337693.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4298,
      "step_time": 22.854691732674837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 161.375,
      "completions/mean_terminated_length": 161.375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.3602997586131096,
      "epoch": 0.19911996294580825,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002583717694506049,
      "kl": 0.0021408379543572664,
      "learning_rate": 9.601852709587772e-07,
      "loss": 0.0001,
      "num_tokens": 118392483.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4299,
      "step_time": 27.734892047941685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 145.625,
      "completions/mean_terminated_length": 145.625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3382541611790657,
      "epoch": 0.19916628068550254,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007224246393889189,
      "kl": 0.004065761575475335,
      "learning_rate": 9.601760074108383e-07,
      "loss": 0.0002,
      "num_tokens": 118414717.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4300,
      "step_time": 19.81400351598859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 219.75,
      "completions/mean_terminated_length": 219.75,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "entropy": 0.16925596073269844,
      "epoch": 0.19921259842519684,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036267966497689486,
      "kl": 0.02466302504763007,
      "learning_rate": 9.601667438628995e-07,
      "loss": 0.0012,
      "num_tokens": 118438681.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4301,
      "step_time": 25.158739805221558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 150.0,
      "completions/mean_terminated_length": 150.0,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.27261241525411606,
      "epoch": 0.19925891616489116,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013136201538145542,
      "kl": 0.008726644096896052,
      "learning_rate": 9.601574803149606e-07,
      "loss": 0.0004,
      "num_tokens": 118460665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4302,
      "step_time": 18.342788469046354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 117.0625,
      "completions/mean_terminated_length": 117.0625,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.2522359751164913,
      "epoch": 0.19930523390458546,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007597236428409815,
      "kl": 0.003091784135904163,
      "learning_rate": 9.601482167670217e-07,
      "loss": 0.0002,
      "num_tokens": 118480314.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4303,
      "step_time": 14.608827654272318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 134.8125,
      "completions/mean_terminated_length": 134.8125,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.17270654812455177,
      "epoch": 0.19935155164427976,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020094928331673145,
      "kl": 0.0017358693294227123,
      "learning_rate": 9.601389532190828e-07,
      "loss": 0.0001,
      "num_tokens": 118511879.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 4304,
      "step_time": 20.30519315227866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 137.75,
      "completions/mean_terminated_length": 137.75,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.2364879846572876,
      "epoch": 0.19939786938397405,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003375881351530552,
      "kl": 0.0024673426523804665,
      "learning_rate": 9.60129689671144e-07,
      "loss": 0.0001,
      "num_tokens": 118531699.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4305,
      "step_time": 15.670146342366934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 378.0,
      "completions/max_terminated_length": 378.0,
      "completions/mean_length": 354.125,
      "completions/mean_terminated_length": 354.125,
      "completions/min_length": 335.0,
      "completions/min_terminated_length": 335.0,
      "entropy": 0.25742775574326515,
      "epoch": 0.19944418712366838,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14799560606479645,
      "kl": 0.026925077196210623,
      "learning_rate": 9.60120426123205e-07,
      "loss": 0.0024,
      "num_tokens": 118561205.0,
      "reward": 0.9811591506004333,
      "reward_std": 0.013905213214457035,
      "rewards/reward_func/mean": 0.9811591506004333,
      "rewards/reward_func/std": 0.013905220665037632,
      "step": 4306,
      "step_time": 36.99573115259409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 186.1875,
      "completions/mean_terminated_length": 186.1875,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.17282568290829659,
      "epoch": 0.19949050486336267,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029926139395684004,
      "kl": 0.004836944048292935,
      "learning_rate": 9.601111625752662e-07,
      "loss": 0.0002,
      "num_tokens": 118584632.0,
      "reward": 0.8503032922744751,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8503032922744751,
      "rewards/reward_func/std": 0.0,
      "step": 4307,
      "step_time": 22.417905122041702
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 192.5625,
      "completions/mean_terminated_length": 192.5625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.38263826817274094,
      "epoch": 0.19953682260305697,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0086146779358387,
      "kl": 0.005638763657771051,
      "learning_rate": 9.601018990273273e-07,
      "loss": 0.0003,
      "num_tokens": 118608785.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4308,
      "step_time": 23.658910185098648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 163.375,
      "completions/mean_terminated_length": 163.375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.2452923022210598,
      "epoch": 0.19958314034275126,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016304061282426119,
      "kl": 0.001445953967049718,
      "learning_rate": 9.600926354793887e-07,
      "loss": 0.0001,
      "num_tokens": 118636439.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 4309,
      "step_time": 21.38775284215808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 371.0,
      "completions/max_terminated_length": 371.0,
      "completions/mean_length": 223.25,
      "completions/mean_terminated_length": 223.25,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.45355185866355896,
      "epoch": 0.1996294580824456,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.008887724950909615,
      "kl": 0.00968296267092228,
      "learning_rate": 9.600833719314498e-07,
      "loss": 0.0003,
      "num_tokens": 118677355.0,
      "reward": 7.646224275958957e-08,
      "reward_std": 1.4227086353457707e-07,
      "rewards/reward_func/mean": 7.646224275958957e-08,
      "rewards/reward_func/std": 1.422708777454318e-07,
      "step": 4310,
      "step_time": 40.341128807514906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 176.9375,
      "completions/mean_terminated_length": 176.9375,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.16954953595995903,
      "epoch": 0.19967577582213988,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003977600950747728,
      "kl": 0.004344969056546688,
      "learning_rate": 9.60074108383511e-07,
      "loss": 0.0002,
      "num_tokens": 118729914.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4311,
      "step_time": 27.7505673058331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 174.375,
      "completions/mean_terminated_length": 174.375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.1930382288992405,
      "epoch": 0.19972209356183418,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1841389536857605,
      "kl": 0.02196095557883382,
      "learning_rate": 9.600648448355718e-07,
      "loss": 0.0097,
      "num_tokens": 118752560.0,
      "reward": 0.4496094584465027,
      "reward_std": 0.026650357991456985,
      "rewards/reward_func/mean": 0.4496094584465027,
      "rewards/reward_func/std": 0.026650357991456985,
      "step": 4312,
      "step_time": 20.353506673127413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 291.0,
      "completions/max_terminated_length": 291.0,
      "completions/mean_length": 236.1875,
      "completions/mean_terminated_length": 236.1875,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "entropy": 0.3484003394842148,
      "epoch": 0.19976841130152848,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09551838785409927,
      "kl": 0.02796410722658038,
      "learning_rate": 9.600555812876332e-07,
      "loss": -0.061,
      "num_tokens": 118785635.0,
      "reward": 0.4152009189128876,
      "reward_std": 0.47523757815361023,
      "rewards/reward_func/mean": 0.4152009189128876,
      "rewards/reward_func/std": 0.47523754835128784,
      "step": 4313,
      "step_time": 30.679507791996002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 177.4375,
      "completions/mean_terminated_length": 177.4375,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.14813685044646263,
      "epoch": 0.1998147290412228,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001104956492781639,
      "kl": 0.0009701547678560019,
      "learning_rate": 9.600463177396943e-07,
      "loss": 0.0,
      "num_tokens": 118814586.0,
      "reward": 0.8507331609725952,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8507331609725952,
      "rewards/reward_func/std": 0.0,
      "step": 4314,
      "step_time": 21.842204809188843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 130.0,
      "completions/max_terminated_length": 130.0,
      "completions/mean_length": 119.6875,
      "completions/mean_terminated_length": 119.6875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2730370759963989,
      "epoch": 0.1998610467809171,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00297934515401721,
      "kl": 0.0021234832529444247,
      "learning_rate": 9.600370541917554e-07,
      "loss": 0.0001,
      "num_tokens": 118835605.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4315,
      "step_time": 14.282845091074705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 126.0625,
      "completions/mean_terminated_length": 126.0625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.3452801778912544,
      "epoch": 0.1999073645206114,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00466137146577239,
      "kl": 0.0026993047795258462,
      "learning_rate": 9.600277906438165e-07,
      "loss": 0.0001,
      "num_tokens": 118856294.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4316,
      "step_time": 16.659909810870886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 279.0,
      "completions/max_terminated_length": 279.0,
      "completions/mean_length": 220.3125,
      "completions/mean_terminated_length": 220.3125,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "entropy": 0.4068012908101082,
      "epoch": 0.1999536822603057,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10009792447090149,
      "kl": 0.009737064130604267,
      "learning_rate": 9.600185270958777e-07,
      "loss": -0.0476,
      "num_tokens": 118883243.0,
      "reward": 0.39948156476020813,
      "reward_std": 0.46782490611076355,
      "rewards/reward_func/mean": 0.39948156476020813,
      "rewards/reward_func/std": 0.46782493591308594,
      "step": 4317,
      "step_time": 27.1250514164567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 169.6875,
      "completions/mean_terminated_length": 169.6875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.3841549754142761,
      "epoch": 0.2,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15948259830474854,
      "kl": 0.012854799628257751,
      "learning_rate": 9.600092635479388e-07,
      "loss": 0.0265,
      "num_tokens": 118906550.0,
      "reward": 0.018789635971188545,
      "reward_std": 0.05163368210196495,
      "rewards/reward_func/mean": 0.018789635971188545,
      "rewards/reward_func/std": 0.05163368210196495,
      "step": 4318,
      "step_time": 20.691959884017706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 142.3125,
      "completions/mean_terminated_length": 142.3125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.33237239718437195,
      "epoch": 0.2000463177396943,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004691860172897577,
      "kl": 0.002636209363117814,
      "learning_rate": 9.6e-07,
      "loss": 0.0001,
      "num_tokens": 118933531.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4319,
      "step_time": 18.774765387177467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 193.125,
      "completions/mean_terminated_length": 193.125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.18039793521165848,
      "epoch": 0.2000926354793886,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11167694628238678,
      "kl": 0.033629335928708315,
      "learning_rate": 9.59990736452061e-07,
      "loss": -0.0231,
      "num_tokens": 118955085.0,
      "reward": 0.7709895968437195,
      "reward_std": 0.19339685142040253,
      "rewards/reward_func/mean": 0.7709895968437195,
      "rewards/reward_func/std": 0.19339686632156372,
      "step": 4320,
      "step_time": 23.29052422195673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 174.0625,
      "completions/mean_terminated_length": 174.0625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.42348533123731613,
      "epoch": 0.2001389532190829,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005849068984389305,
      "kl": 0.004332504118792713,
      "learning_rate": 9.599814729041222e-07,
      "loss": 0.0002,
      "num_tokens": 118990238.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4321,
      "step_time": 25.806898567825556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 189.6875,
      "completions/mean_terminated_length": 189.6875,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.18591520190238953,
      "epoch": 0.20018527095877722,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004633432254195213,
      "kl": 0.009485263377428055,
      "learning_rate": 9.599722093561833e-07,
      "loss": 0.0005,
      "num_tokens": 119023737.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4322,
      "step_time": 24.49805849790573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 122.5625,
      "completions/mean_terminated_length": 122.5625,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2583474703133106,
      "epoch": 0.20023158869847152,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035095608327537775,
      "kl": 0.0019000690372195095,
      "learning_rate": 9.599629458082446e-07,
      "loss": 0.0001,
      "num_tokens": 119043394.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4323,
      "step_time": 16.022602926939726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 137.3125,
      "completions/mean_terminated_length": 137.3125,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.35340263694524765,
      "epoch": 0.20027790643816581,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029389699921011925,
      "kl": 0.001835564646171406,
      "learning_rate": 9.599536822603058e-07,
      "loss": 0.0001,
      "num_tokens": 119079287.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4324,
      "step_time": 20.98586842417717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 158.0625,
      "completions/mean_terminated_length": 158.0625,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.19614377617835999,
      "epoch": 0.2003242241778601,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17011243104934692,
      "kl": 0.007219140185043216,
      "learning_rate": 9.599444187123667e-07,
      "loss": 0.0188,
      "num_tokens": 119103032.0,
      "reward": 0.8365928530693054,
      "reward_std": 0.11378215998411179,
      "rewards/reward_func/mean": 0.8365928530693054,
      "rewards/reward_func/std": 0.11378216743469238,
      "step": 4325,
      "step_time": 20.398106019943953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 120.875,
      "completions/mean_terminated_length": 120.875,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.32871055603027344,
      "epoch": 0.20037054191755443,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004803883843123913,
      "kl": 0.0029782846686430275,
      "learning_rate": 9.59935155164428e-07,
      "loss": 0.0001,
      "num_tokens": 119123830.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4326,
      "step_time": 15.204549200832844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 188.5,
      "completions/mean_terminated_length": 188.5,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.3127734512090683,
      "epoch": 0.20041685965724873,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012301845476031303,
      "kl": 0.014960448257625103,
      "learning_rate": 9.599258916164891e-07,
      "loss": 0.0008,
      "num_tokens": 119147470.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4327,
      "step_time": 21.793963704258204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 148.375,
      "completions/mean_terminated_length": 148.375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.34366950392723083,
      "epoch": 0.20046317739694303,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0045435307547450066,
      "kl": 0.002444214071147144,
      "learning_rate": 9.599166280685502e-07,
      "loss": 0.0001,
      "num_tokens": 119174916.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4328,
      "step_time": 20.995104853063822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 188.5,
      "completions/mean_terminated_length": 188.5,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.15307361632585526,
      "epoch": 0.20050949513663732,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15147282183170319,
      "kl": 0.0153071487438865,
      "learning_rate": 9.599073645206114e-07,
      "loss": -0.0684,
      "num_tokens": 119199324.0,
      "reward": 0.7627211809158325,
      "reward_std": 0.14264582097530365,
      "rewards/reward_func/mean": 0.7627211809158325,
      "rewards/reward_func/std": 0.14264583587646484,
      "step": 4329,
      "step_time": 21.754540774971247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 284.0,
      "completions/max_terminated_length": 284.0,
      "completions/mean_length": 252.125,
      "completions/mean_terminated_length": 252.125,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "entropy": 0.2038777731359005,
      "epoch": 0.20055581287633165,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004737113602459431,
      "kl": 0.004034957790281624,
      "learning_rate": 9.598981009726725e-07,
      "loss": 0.0002,
      "num_tokens": 119223262.0,
      "reward": 0.8434853553771973,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8434853553771973,
      "rewards/reward_func/std": 0.0,
      "step": 4330,
      "step_time": 27.322276193648577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 280.0,
      "completions/max_terminated_length": 280.0,
      "completions/mean_length": 239.25,
      "completions/mean_terminated_length": 239.25,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "entropy": 0.2365478202700615,
      "epoch": 0.20060213061602594,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002932611620053649,
      "kl": 0.0027366644062567502,
      "learning_rate": 9.598888374247336e-07,
      "loss": 0.0001,
      "num_tokens": 119250018.0,
      "reward": 0.7398260831832886,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7398260831832886,
      "rewards/reward_func/std": 0.0,
      "step": 4331,
      "step_time": 28.320982787758112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 175.1875,
      "completions/mean_terminated_length": 175.1875,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.3801484704017639,
      "epoch": 0.20064844835572024,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1591929942369461,
      "kl": 0.0030830000760033727,
      "learning_rate": 9.598795738767947e-07,
      "loss": -0.1124,
      "num_tokens": 119276725.0,
      "reward": 0.11765605211257935,
      "reward_std": 0.3222125172615051,
      "rewards/reward_func/mean": 0.11765605211257935,
      "rewards/reward_func/std": 0.3222125172615051,
      "step": 4332,
      "step_time": 24.597198083996773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 152.9375,
      "completions/mean_terminated_length": 152.9375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2738218382000923,
      "epoch": 0.20069476609541453,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012202922254800797,
      "kl": 0.0170272181276232,
      "learning_rate": 9.598703103288559e-07,
      "loss": 0.0009,
      "num_tokens": 119297284.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 4333,
      "step_time": 19.582966059446335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 202.125,
      "completions/mean_terminated_length": 202.125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.23936771973967552,
      "epoch": 0.20074108383510886,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10019609332084656,
      "kl": 0.0017584140587132424,
      "learning_rate": 9.59861046780917e-07,
      "loss": -0.0591,
      "num_tokens": 119330070.0,
      "reward": 0.4562724530696869,
      "reward_std": 0.49524882435798645,
      "rewards/reward_func/mean": 0.4562724530696869,
      "rewards/reward_func/std": 0.49524882435798645,
      "step": 4334,
      "step_time": 25.87820515036583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 222.375,
      "completions/mean_terminated_length": 222.375,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.256301824003458,
      "epoch": 0.20078740157480315,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033000006806105375,
      "kl": 0.002873494871892035,
      "learning_rate": 9.598517832329781e-07,
      "loss": 0.0001,
      "num_tokens": 119356956.0,
      "reward": 0.3219582736492157,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3219582736492157,
      "rewards/reward_func/std": 0.0,
      "step": 4335,
      "step_time": 26.77712243050337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 334.0,
      "completions/max_terminated_length": 334.0,
      "completions/mean_length": 259.625,
      "completions/mean_terminated_length": 259.625,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.2784839943051338,
      "epoch": 0.20083371931449745,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06832694262266159,
      "kl": 0.02920991904102266,
      "learning_rate": 9.598425196850395e-07,
      "loss": 0.0009,
      "num_tokens": 119379798.0,
      "reward": 0.944659948348999,
      "reward_std": 0.22136029601097107,
      "rewards/reward_func/mean": 0.944659948348999,
      "rewards/reward_func/std": 0.22136031091213226,
      "step": 4336,
      "step_time": 30.97024843469262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 139.25,
      "completions/mean_terminated_length": 139.25,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.30177614092826843,
      "epoch": 0.20088003705419175,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003893825225532055,
      "kl": 0.0022443159541580826,
      "learning_rate": 9.598332561371004e-07,
      "loss": 0.0001,
      "num_tokens": 119403354.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4337,
      "step_time": 17.57817819342017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 159.5625,
      "completions/mean_terminated_length": 159.5625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.18841608986258507,
      "epoch": 0.20092635479388607,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003976137842983007,
      "kl": 0.002540547284297645,
      "learning_rate": 9.598239925891615e-07,
      "loss": 0.0001,
      "num_tokens": 119429459.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 4338,
      "step_time": 20.13024763390422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 194.125,
      "completions/mean_terminated_length": 194.125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.37682662159204483,
      "epoch": 0.20097267253358037,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1389477550983429,
      "kl": 0.009646297781728208,
      "learning_rate": 9.598147290412228e-07,
      "loss": -0.0543,
      "num_tokens": 119460325.0,
      "reward": 0.12000277638435364,
      "reward_std": 0.32823479175567627,
      "rewards/reward_func/mean": 0.12000277638435364,
      "rewards/reward_func/std": 0.32823479175567627,
      "step": 4339,
      "step_time": 24.533297430723906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 134.25,
      "completions/mean_terminated_length": 134.25,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.24463994428515434,
      "epoch": 0.20101899027327466,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019469514954835176,
      "kl": 0.001599260256625712,
      "learning_rate": 9.59805465493284e-07,
      "loss": 0.0001,
      "num_tokens": 119482361.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4340,
      "step_time": 16.351763870567083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 192.3125,
      "completions/mean_terminated_length": 192.3125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.3754243478178978,
      "epoch": 0.20106530801296896,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005679195746779442,
      "kl": 0.004411848145537078,
      "learning_rate": 9.59796201945345e-07,
      "loss": 0.0002,
      "num_tokens": 119513550.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4341,
      "step_time": 23.29446402937174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 341.0,
      "completions/max_terminated_length": 341.0,
      "completions/mean_length": 259.25,
      "completions/mean_terminated_length": 259.25,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "entropy": 0.2867109067738056,
      "epoch": 0.20111162575266328,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12100539356470108,
      "kl": 0.01235551736317575,
      "learning_rate": 9.597869383974062e-07,
      "loss": -0.0355,
      "num_tokens": 119552994.0,
      "reward": 0.6384425759315491,
      "reward_std": 0.2656787037849426,
      "rewards/reward_func/mean": 0.6384425759315491,
      "rewards/reward_func/std": 0.2656787037849426,
      "step": 4342,
      "step_time": 35.454101640731096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 159.3125,
      "completions/mean_terminated_length": 159.3125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.25620218738913536,
      "epoch": 0.20115794349235758,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003614285262301564,
      "kl": 0.0031151602743193507,
      "learning_rate": 9.597776748494673e-07,
      "loss": 0.0002,
      "num_tokens": 119573831.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4343,
      "step_time": 18.78634275868535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 125.5625,
      "completions/mean_terminated_length": 125.5625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.2814929857850075,
      "epoch": 0.20120426123205187,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002326444024220109,
      "kl": 0.0017608084890525788,
      "learning_rate": 9.597684113015285e-07,
      "loss": 0.0001,
      "num_tokens": 119594688.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4344,
      "step_time": 15.133714221417904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 162.0625,
      "completions/mean_terminated_length": 162.0625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.3477717563509941,
      "epoch": 0.20125057897174617,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13272856175899506,
      "kl": 0.011347837978973985,
      "learning_rate": 9.597591477535896e-07,
      "loss": -0.0993,
      "num_tokens": 119617457.0,
      "reward": 0.09390753507614136,
      "reward_std": 0.12521004676818848,
      "rewards/reward_func/mean": 0.09390753507614136,
      "rewards/reward_func/std": 0.12521004676818848,
      "step": 4345,
      "step_time": 23.084840770810843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 205.25,
      "completions/mean_terminated_length": 205.25,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "entropy": 0.24499881267547607,
      "epoch": 0.2012968967114405,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1363125443458557,
      "kl": 0.008282593917101622,
      "learning_rate": 9.597498842056507e-07,
      "loss": -0.0143,
      "num_tokens": 119644821.0,
      "reward": 0.5969303250312805,
      "reward_std": 0.04848959296941757,
      "rewards/reward_func/mean": 0.5969303250312805,
      "rewards/reward_func/std": 0.04848960041999817,
      "step": 4346,
      "step_time": 24.36744337901473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 540.0,
      "completions/max_terminated_length": 540.0,
      "completions/mean_length": 333.1875,
      "completions/mean_terminated_length": 333.1875,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.3103319816291332,
      "epoch": 0.2013432144511348,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08856040984392166,
      "kl": 0.01259826822206378,
      "learning_rate": 9.597406206577118e-07,
      "loss": 0.24,
      "num_tokens": 119672584.0,
      "reward": 0.3125,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.3125,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 4347,
      "step_time": 51.124805852770805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 205.625,
      "completions/mean_terminated_length": 205.625,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "entropy": 0.19548982754349709,
      "epoch": 0.20138953219082908,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020060925744473934,
      "kl": 0.0017750268161762506,
      "learning_rate": 9.59731357109773e-07,
      "loss": 0.0001,
      "num_tokens": 119709186.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 4348,
      "step_time": 26.622994769364595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 128.125,
      "completions/mean_terminated_length": 128.125,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.32591327279806137,
      "epoch": 0.20143584993052338,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001764351618476212,
      "kl": 0.0015986388607416302,
      "learning_rate": 9.59722093561834e-07,
      "loss": 0.0001,
      "num_tokens": 119731092.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4349,
      "step_time": 17.040066741406918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 159.0,
      "completions/mean_terminated_length": 159.0,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.26894331723451614,
      "epoch": 0.2014821676702177,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005290125031024218,
      "kl": 0.0035272493842057884,
      "learning_rate": 9.597128300138952e-07,
      "loss": 0.0002,
      "num_tokens": 119752404.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 4350,
      "step_time": 19.064855866134167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 149.9375,
      "completions/mean_terminated_length": 149.9375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.3027791827917099,
      "epoch": 0.201528485409912,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002835450926795602,
      "kl": 0.0028183605172671378,
      "learning_rate": 9.597035664659563e-07,
      "loss": 0.0001,
      "num_tokens": 119783843.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4351,
      "step_time": 20.90618012100458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 181.25,
      "completions/mean_terminated_length": 181.25,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.15881448611617088,
      "epoch": 0.2015748031496063,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.168456569314003,
      "kl": 0.0038807641831226647,
      "learning_rate": 9.596943029180175e-07,
      "loss": -0.0727,
      "num_tokens": 119808903.0,
      "reward": 0.9300388693809509,
      "reward_std": 0.027310028672218323,
      "rewards/reward_func/mean": 0.9300388693809509,
      "rewards/reward_func/std": 0.027310030534863472,
      "step": 4352,
      "step_time": 24.106820344924927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 106.6875,
      "completions/mean_terminated_length": 106.6875,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.28208357840776443,
      "epoch": 0.2016211208893006,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008142439648509026,
      "kl": 0.0031564083765260875,
      "learning_rate": 9.596850393700788e-07,
      "loss": 0.0002,
      "num_tokens": 119828738.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4353,
      "step_time": 15.13072595745325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 284.0,
      "completions/max_terminated_length": 284.0,
      "completions/mean_length": 232.9375,
      "completions/mean_terminated_length": 232.9375,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.387161947786808,
      "epoch": 0.20166743862899492,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08558937162160873,
      "kl": 0.012177588418126106,
      "learning_rate": 9.5967577582214e-07,
      "loss": 0.0151,
      "num_tokens": 119866625.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 4354,
      "step_time": 30.575301326811314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 138.5,
      "completions/mean_terminated_length": 138.5,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.30372022092342377,
      "epoch": 0.2017137563686892,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007324842736124992,
      "kl": 0.004446553764864802,
      "learning_rate": 9.596665122742008e-07,
      "loss": 0.0002,
      "num_tokens": 119886409.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4355,
      "step_time": 17.15417054668069
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 176.25,
      "completions/mean_terminated_length": 176.25,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3980333209037781,
      "epoch": 0.2017600741083835,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00380044081248343,
      "kl": 0.003151686047203839,
      "learning_rate": 9.596572487262622e-07,
      "loss": 0.0002,
      "num_tokens": 119926557.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4356,
      "step_time": 24.9662903547287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 120.0,
      "completions/mean_terminated_length": 120.0,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2784702852368355,
      "epoch": 0.2018063918480778,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031887576915323734,
      "kl": 0.0020782564824912697,
      "learning_rate": 9.596479851783233e-07,
      "loss": 0.0001,
      "num_tokens": 119946253.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4357,
      "step_time": 14.472920812666416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 374.0,
      "completions/max_terminated_length": 374.0,
      "completions/mean_length": 291.6875,
      "completions/mean_terminated_length": 291.6875,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "entropy": 0.1992134526371956,
      "epoch": 0.20185270958777213,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00861821137368679,
      "kl": 0.009224783629179,
      "learning_rate": 9.596387216303844e-07,
      "loss": 0.0005,
      "num_tokens": 119971800.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4358,
      "step_time": 35.20111673697829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 184.1875,
      "completions/mean_terminated_length": 184.1875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.21076007932424545,
      "epoch": 0.20189902732746642,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014462699182331562,
      "kl": 0.0011636416893452406,
      "learning_rate": 9.596294580824455e-07,
      "loss": 0.0001,
      "num_tokens": 120006475.0,
      "reward": 0.11362193524837494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11362193524837494,
      "rewards/reward_func/std": 0.0,
      "step": 4359,
      "step_time": 23.57875981926918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 180.875,
      "completions/mean_terminated_length": 180.875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.26818672940135,
      "epoch": 0.20194534506716072,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006156525108963251,
      "kl": 0.0054788870038464665,
      "learning_rate": 9.596201945345067e-07,
      "loss": 0.0003,
      "num_tokens": 120028025.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4360,
      "step_time": 23.37025962397456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 149.6875,
      "completions/mean_terminated_length": 149.6875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.22371386364102364,
      "epoch": 0.20199166280685502,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031889823731034994,
      "kl": 0.0017451727471780032,
      "learning_rate": 9.596109309865678e-07,
      "loss": 0.0001,
      "num_tokens": 120048308.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4361,
      "step_time": 18.897894211113453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 165.75,
      "completions/mean_terminated_length": 165.75,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.1724872589111328,
      "epoch": 0.20203798054654934,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16300824284553528,
      "kl": 0.041608518455177546,
      "learning_rate": 9.59601667438629e-07,
      "loss": -0.0355,
      "num_tokens": 120069360.0,
      "reward": 0.6697071194648743,
      "reward_std": 0.2915422022342682,
      "rewards/reward_func/mean": 0.6697071194648743,
      "rewards/reward_func/std": 0.2915422320365906,
      "step": 4362,
      "step_time": 18.362178031355143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 178.75,
      "completions/mean_terminated_length": 178.75,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.1738715022802353,
      "epoch": 0.20208429828624364,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0047481004148721695,
      "kl": 0.013067428022623062,
      "learning_rate": 9.5959240389069e-07,
      "loss": 0.0007,
      "num_tokens": 120106428.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4363,
      "step_time": 23.938401725143194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 164.4375,
      "completions/mean_terminated_length": 164.4375,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.16497112810611725,
      "epoch": 0.20213061602593793,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018727611750364304,
      "kl": 0.002182774478569627,
      "learning_rate": 9.595831403427512e-07,
      "loss": 0.0001,
      "num_tokens": 120127795.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 4364,
      "step_time": 18.761870093643665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 136.875,
      "completions/mean_terminated_length": 136.875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2960905507206917,
      "epoch": 0.20217693376563223,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004074579104781151,
      "kl": 0.002672590548172593,
      "learning_rate": 9.595738767948123e-07,
      "loss": 0.0001,
      "num_tokens": 120149601.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4365,
      "step_time": 18.973962906748056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 348.0,
      "completions/max_terminated_length": 348.0,
      "completions/mean_length": 269.6875,
      "completions/mean_terminated_length": 269.6875,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "entropy": 0.29117827117443085,
      "epoch": 0.20222325150532655,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08026869595050812,
      "kl": 0.013203104259446263,
      "learning_rate": 9.595646132468736e-07,
      "loss": -0.0996,
      "num_tokens": 120182924.0,
      "reward": 0.42102187871932983,
      "reward_std": 0.41016829013824463,
      "rewards/reward_func/mean": 0.42102187871932983,
      "rewards/reward_func/std": 0.410168319940567,
      "step": 4366,
      "step_time": 36.51097435876727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 138.5625,
      "completions/mean_terminated_length": 138.5625,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.32694901525974274,
      "epoch": 0.20226956924502085,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00745930103585124,
      "kl": 0.005590903921984136,
      "learning_rate": 9.595553496989348e-07,
      "loss": 0.0003,
      "num_tokens": 120203861.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4367,
      "step_time": 18.598782904446125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 318.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 274.4375,
      "completions/mean_terminated_length": 274.4375,
      "completions/min_length": 246.0,
      "completions/min_terminated_length": 246.0,
      "entropy": 0.27239735424518585,
      "epoch": 0.20231588698471514,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07638388127088547,
      "kl": 0.011131863575428724,
      "learning_rate": 9.595460861509957e-07,
      "loss": 0.0218,
      "num_tokens": 120243100.0,
      "reward": 0.9975948929786682,
      "reward_std": 0.009620373137295246,
      "rewards/reward_func/mean": 0.9975948929786682,
      "rewards/reward_func/std": 0.009620368480682373,
      "step": 4368,
      "step_time": 34.29851580038667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 151.0625,
      "completions/mean_terminated_length": 151.0625,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.2635408937931061,
      "epoch": 0.20236220472440944,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002637336030602455,
      "kl": 0.0023205436300486326,
      "learning_rate": 9.59536822603057e-07,
      "loss": 0.0001,
      "num_tokens": 120263405.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4369,
      "step_time": 17.772504441440105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 190.3125,
      "completions/mean_terminated_length": 190.3125,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.4073800668120384,
      "epoch": 0.20240852246410376,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00430545536801219,
      "kl": 0.0041122655384242535,
      "learning_rate": 9.595275590551181e-07,
      "loss": 0.0002,
      "num_tokens": 120305474.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4370,
      "step_time": 27.753872349858284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 136.6875,
      "completions/mean_terminated_length": 136.6875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.21981431543827057,
      "epoch": 0.20245484020379806,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020968785975128412,
      "kl": 0.0012202737852931023,
      "learning_rate": 9.595182955071793e-07,
      "loss": 0.0001,
      "num_tokens": 120325773.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4371,
      "step_time": 18.225017122924328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 128.6875,
      "completions/mean_terminated_length": 128.6875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.25608067214488983,
      "epoch": 0.20250115794349235,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00358181307092309,
      "kl": 0.002328541479073465,
      "learning_rate": 9.595090319592404e-07,
      "loss": 0.0001,
      "num_tokens": 120348792.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4372,
      "step_time": 17.058714006096125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 186.9375,
      "completions/mean_terminated_length": 186.9375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.3649733439087868,
      "epoch": 0.20254747568318665,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1176580861210823,
      "kl": 0.00996137852780521,
      "learning_rate": 9.594997684113015e-07,
      "loss": -0.033,
      "num_tokens": 120370519.0,
      "reward": 0.05787256732583046,
      "reward_std": 0.23149026930332184,
      "rewards/reward_func/mean": 0.05787256732583046,
      "rewards/reward_func/std": 0.23149026930332184,
      "step": 4373,
      "step_time": 22.45996080338955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 149.1875,
      "completions/mean_terminated_length": 149.1875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.4010501429438591,
      "epoch": 0.20259379342288097,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002867297036573291,
      "kl": 0.0027788946172222495,
      "learning_rate": 9.594905048633626e-07,
      "loss": 0.0001,
      "num_tokens": 120423290.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4374,
      "step_time": 25.786354899406433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 155.8125,
      "completions/mean_terminated_length": 155.8125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.30683937668800354,
      "epoch": 0.20264011116257527,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026754315476864576,
      "kl": 0.002054195530945435,
      "learning_rate": 9.594812413154238e-07,
      "loss": 0.0001,
      "num_tokens": 120453703.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4375,
      "step_time": 21.604269791394472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 197.25,
      "completions/mean_terminated_length": 197.25,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.20628655701875687,
      "epoch": 0.20268642890226957,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010255440138280392,
      "kl": 0.009761344874277711,
      "learning_rate": 9.594719777674849e-07,
      "loss": 0.0005,
      "num_tokens": 120475643.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4376,
      "step_time": 21.476807940751314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 131.5,
      "completions/mean_terminated_length": 131.5,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.23010990768671036,
      "epoch": 0.20273274664196386,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007033254019916058,
      "kl": 0.00605582888238132,
      "learning_rate": 9.59462714219546e-07,
      "loss": 0.0003,
      "num_tokens": 120503939.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4377,
      "step_time": 18.441113721579313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 253.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 199.0625,
      "completions/mean_terminated_length": 199.0625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.2417679913341999,
      "epoch": 0.2027790643816582,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003662134986370802,
      "kl": 0.00411995907779783,
      "learning_rate": 9.594534506716071e-07,
      "loss": 0.0002,
      "num_tokens": 120531300.0,
      "reward": 0.7577395439147949,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7577395439147949,
      "rewards/reward_func/std": 0.0,
      "step": 4378,
      "step_time": 26.69351141527295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 146.0,
      "completions/mean_terminated_length": 146.0,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.16164137795567513,
      "epoch": 0.20282538212135248,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006584150716662407,
      "kl": 0.003201333398465067,
      "learning_rate": 9.594441871236685e-07,
      "loss": 0.0002,
      "num_tokens": 120551668.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 4379,
      "step_time": 18.956845924258232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 219.4375,
      "completions/mean_terminated_length": 219.4375,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "entropy": 0.29696741700172424,
      "epoch": 0.20287169986104678,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014850565232336521,
      "kl": 0.017927177250385284,
      "learning_rate": 9.594349235757294e-07,
      "loss": 0.0009,
      "num_tokens": 120573419.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4380,
      "step_time": 23.3524604216218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 157.5,
      "completions/mean_terminated_length": 157.5,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.35661375522613525,
      "epoch": 0.20291801760074107,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014806061051785946,
      "kl": 0.013286509085446596,
      "learning_rate": 9.594256600277905e-07,
      "loss": 0.0007,
      "num_tokens": 120598387.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4381,
      "step_time": 19.596401181071997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.33992859721183777,
      "epoch": 0.2029643353404354,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00979616492986679,
      "kl": 0.007574495393782854,
      "learning_rate": 9.594163964798516e-07,
      "loss": 0.0004,
      "num_tokens": 120620113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4382,
      "step_time": 19.149428606033325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 128.5625,
      "completions/mean_terminated_length": 128.5625,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.2978258281946182,
      "epoch": 0.2030106530801297,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005817455239593983,
      "kl": 0.003650028840638697,
      "learning_rate": 9.59407132931913e-07,
      "loss": 0.0002,
      "num_tokens": 120641290.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4383,
      "step_time": 17.43656163290143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 130.0,
      "completions/max_terminated_length": 130.0,
      "completions/mean_length": 114.8125,
      "completions/mean_terminated_length": 114.8125,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.27465222030878067,
      "epoch": 0.203056970819824,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002690394874662161,
      "kl": 0.0017183998716063797,
      "learning_rate": 9.59397869383974e-07,
      "loss": 0.0001,
      "num_tokens": 120660855.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4384,
      "step_time": 14.342736564576626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 199.6875,
      "completions/mean_terminated_length": 199.6875,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.2033468410372734,
      "epoch": 0.20310328855951829,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011636612005531788,
      "kl": 0.008831958984956145,
      "learning_rate": 9.593886058360352e-07,
      "loss": 0.0004,
      "num_tokens": 120682306.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4385,
      "step_time": 21.862970259040594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 156.5625,
      "completions/mean_terminated_length": 156.5625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.1484076865017414,
      "epoch": 0.2031496062992126,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004241873510181904,
      "kl": 0.0027250301791355014,
      "learning_rate": 9.593793422880963e-07,
      "loss": 0.0001,
      "num_tokens": 120704619.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 4386,
      "step_time": 21.26169503480196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 136.6875,
      "completions/mean_terminated_length": 136.6875,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.2334960661828518,
      "epoch": 0.2031959240389069,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003352184547111392,
      "kl": 0.0019840349268633872,
      "learning_rate": 9.593700787401575e-07,
      "loss": 0.0001,
      "num_tokens": 120724310.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4387,
      "step_time": 16.824626177549362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 131.0,
      "completions/mean_length": 111.25,
      "completions/mean_terminated_length": 111.25,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2790839448571205,
      "epoch": 0.2032422417786012,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002758890390396118,
      "kl": 0.001983508700504899,
      "learning_rate": 9.593608151922186e-07,
      "loss": 0.0001,
      "num_tokens": 120744026.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4388,
      "step_time": 14.307689350098372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 218.9375,
      "completions/mean_terminated_length": 218.9375,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "entropy": 0.17713632434606552,
      "epoch": 0.2032885595182955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003758077509701252,
      "kl": 0.003175067831762135,
      "learning_rate": 9.593515516442797e-07,
      "loss": 0.0002,
      "num_tokens": 120773817.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4389,
      "step_time": 25.04056917130947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 210.25,
      "completions/mean_terminated_length": 210.25,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.18416766449809074,
      "epoch": 0.20333487725798982,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.02285883016884327,
      "kl": 0.0019181863754056394,
      "learning_rate": 9.593422880963408e-07,
      "loss": 0.0001,
      "num_tokens": 120809213.0,
      "reward": 0.951229453086853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.951229453086853,
      "rewards/reward_func/std": 0.0,
      "step": 4390,
      "step_time": 26.845209203660488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 130.6875,
      "completions/mean_terminated_length": 130.6875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.30053722113370895,
      "epoch": 0.20338119499768412,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003516019554808736,
      "kl": 0.002620826126076281,
      "learning_rate": 9.59333024548402e-07,
      "loss": 0.0001,
      "num_tokens": 120838696.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4391,
      "step_time": 18.624075073748827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 180.875,
      "completions/mean_terminated_length": 180.875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.434813991189003,
      "epoch": 0.2034275127373784,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037418357096612453,
      "kl": 0.00333327054977417,
      "learning_rate": 9.59323761000463e-07,
      "loss": 0.0002,
      "num_tokens": 120866854.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4392,
      "step_time": 22.500424940139055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 156.1875,
      "completions/mean_terminated_length": 156.1875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.22748274356126785,
      "epoch": 0.2034738304770727,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002320576226338744,
      "kl": 0.0017496794753242284,
      "learning_rate": 9.593144974525242e-07,
      "loss": 0.0001,
      "num_tokens": 120893721.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 4393,
      "step_time": 18.4317070171237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 220.3125,
      "completions/mean_terminated_length": 220.3125,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.24937263131141663,
      "epoch": 0.20352014821676703,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09236271679401398,
      "kl": 0.006573219550773501,
      "learning_rate": 9.593052339045853e-07,
      "loss": -0.0487,
      "num_tokens": 120917390.0,
      "reward": 0.2537243962287903,
      "reward_std": 0.065545953810215,
      "rewards/reward_func/mean": 0.2537243962287903,
      "rewards/reward_func/std": 0.065545953810215,
      "step": 4394,
      "step_time": 28.78863863646984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 209.125,
      "completions/mean_terminated_length": 209.125,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.17656762152910233,
      "epoch": 0.20356646595646133,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0057213762775063515,
      "kl": 0.04755854792892933,
      "learning_rate": 9.592959703566465e-07,
      "loss": 0.0024,
      "num_tokens": 120955376.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4395,
      "step_time": 27.040158040821552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 116.5,
      "completions/mean_terminated_length": 116.5,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.2675146237015724,
      "epoch": 0.20361278369615562,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023586463648825884,
      "kl": 0.0017603501328267157,
      "learning_rate": 9.592867068087078e-07,
      "loss": 0.0001,
      "num_tokens": 120978520.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4396,
      "step_time": 16.30543616786599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 189.9375,
      "completions/mean_terminated_length": 189.9375,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.18880921602249146,
      "epoch": 0.20365910143584992,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007340978365391493,
      "kl": 0.005150568729732186,
      "learning_rate": 9.59277443260769e-07,
      "loss": 0.0003,
      "num_tokens": 121006727.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4397,
      "step_time": 22.45252402871847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 218.3125,
      "completions/mean_terminated_length": 218.3125,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.15946005657315254,
      "epoch": 0.20370541917554424,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1293276995420456,
      "kl": 0.03771508112549782,
      "learning_rate": 9.592681797128298e-07,
      "loss": -0.0428,
      "num_tokens": 121037068.0,
      "reward": 0.9918040037155151,
      "reward_std": 0.01466143038123846,
      "rewards/reward_func/mean": 0.9918040037155151,
      "rewards/reward_func/std": 0.014661417342722416,
      "step": 4398,
      "step_time": 26.476797968149185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 147.5,
      "completions/mean_terminated_length": 147.5,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.4311412423849106,
      "epoch": 0.20375173691523854,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017392414156347513,
      "kl": 0.002208363323006779,
      "learning_rate": 9.592589161648912e-07,
      "loss": 0.0001,
      "num_tokens": 121087188.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4399,
      "step_time": 25.522712852805853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 153.6875,
      "completions/mean_terminated_length": 153.6875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.15880463644862175,
      "epoch": 0.20379805465493284,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029165507294237614,
      "kl": 0.0038405574741773307,
      "learning_rate": 9.592496526169523e-07,
      "loss": 0.0002,
      "num_tokens": 121110943.0,
      "reward": 0.8337529301643372,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8337529301643372,
      "rewards/reward_func/std": 0.0,
      "step": 4400,
      "step_time": 18.827107544988394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 182.1875,
      "completions/mean_terminated_length": 182.1875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.1975412853062153,
      "epoch": 0.20384437239462713,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07105456292629242,
      "kl": 0.0011213233519811183,
      "learning_rate": 9.592403890690134e-07,
      "loss": 0.0113,
      "num_tokens": 121141874.0,
      "reward": 0.8560765981674194,
      "reward_std": 0.038379572331905365,
      "rewards/reward_func/mean": 0.8560765981674194,
      "rewards/reward_func/std": 0.03837956488132477,
      "step": 4401,
      "step_time": 23.41580117121339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 143.625,
      "completions/mean_terminated_length": 143.625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.18176880106329918,
      "epoch": 0.20389069013432146,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13676771521568298,
      "kl": 0.0044325272319838405,
      "learning_rate": 9.592311255210746e-07,
      "loss": -0.0306,
      "num_tokens": 121163100.0,
      "reward": 0.884931206703186,
      "reward_std": 0.09087805449962616,
      "rewards/reward_func/mean": 0.884931206703186,
      "rewards/reward_func/std": 0.09087805449962616,
      "step": 4402,
      "step_time": 16.363803543150425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 300.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 188.125,
      "completions/mean_terminated_length": 188.125,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.449375756084919,
      "epoch": 0.20393700787401575,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11944127082824707,
      "kl": 0.0065049066906794906,
      "learning_rate": 9.592218619731357e-07,
      "loss": -0.1757,
      "num_tokens": 121196910.0,
      "reward": 0.002497798530384898,
      "reward_std": 0.005370105616748333,
      "rewards/reward_func/mean": 0.002497798530384898,
      "rewards/reward_func/std": 0.005370105616748333,
      "step": 4403,
      "step_time": 30.80251520872116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 146.4375,
      "completions/mean_terminated_length": 146.4375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3295118361711502,
      "epoch": 0.20398332561371005,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003453849582001567,
      "kl": 0.0028061120538040996,
      "learning_rate": 9.592125984251968e-07,
      "loss": 0.0001,
      "num_tokens": 121218069.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4404,
      "step_time": 19.674273550510406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 203.1875,
      "completions/mean_terminated_length": 203.1875,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.44458911567926407,
      "epoch": 0.20402964335340434,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003419496351853013,
      "kl": 0.003455746453255415,
      "learning_rate": 9.59203334877258e-07,
      "loss": 0.0002,
      "num_tokens": 121241576.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4405,
      "step_time": 24.537865091115236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 180.0625,
      "completions/mean_terminated_length": 180.0625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.3620297908782959,
      "epoch": 0.20407596109309867,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036682465579360723,
      "kl": 0.0029127378948032856,
      "learning_rate": 9.59194071329319e-07,
      "loss": 0.0001,
      "num_tokens": 121263641.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4406,
      "step_time": 20.35015856102109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 291.875,
      "completions/mean_terminated_length": 291.875,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "entropy": 0.1944119967520237,
      "epoch": 0.20412227883279296,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07078786939382553,
      "kl": 0.0031040938338264823,
      "learning_rate": 9.591848077813802e-07,
      "loss": -0.0229,
      "num_tokens": 121300679.0,
      "reward": 0.5942342281341553,
      "reward_std": 0.12828125059604645,
      "rewards/reward_func/mean": 0.5942342281341553,
      "rewards/reward_func/std": 0.12828125059604645,
      "step": 4407,
      "step_time": 33.402572583407164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 464.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 300.4375,
      "completions/mean_terminated_length": 300.4375,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.41832730174064636,
      "epoch": 0.20416859657248726,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08045594394207001,
      "kl": 0.011987762525677681,
      "learning_rate": 9.591755442334413e-07,
      "loss": -0.2775,
      "num_tokens": 121341694.0,
      "reward": 0.42598751187324524,
      "reward_std": 0.4988654851913452,
      "rewards/reward_func/mean": 0.42598751187324524,
      "rewards/reward_func/std": 0.4988655149936676,
      "step": 4408,
      "step_time": 45.18706896901131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 158.5,
      "completions/mean_terminated_length": 158.5,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.17929061502218246,
      "epoch": 0.20421491431218156,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010240025818347931,
      "kl": 0.025862340815365314,
      "learning_rate": 9.591662806855026e-07,
      "loss": 0.0013,
      "num_tokens": 121363094.0,
      "reward": 0.894839346408844,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.894839346408844,
      "rewards/reward_func/std": 0.0,
      "step": 4409,
      "step_time": 21.9126313701272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 226.1875,
      "completions/mean_terminated_length": 226.1875,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "entropy": 0.31503815203905106,
      "epoch": 0.20426123205187588,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11403704434633255,
      "kl": 0.005874601716641337,
      "learning_rate": 9.591570171375638e-07,
      "loss": 0.0202,
      "num_tokens": 121409081.0,
      "reward": 0.8776825666427612,
      "reward_std": 0.33423489332199097,
      "rewards/reward_func/mean": 0.8776825666427612,
      "rewards/reward_func/std": 0.33423489332199097,
      "step": 4410,
      "step_time": 29.934467788785696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 293.0,
      "completions/max_terminated_length": 293.0,
      "completions/mean_length": 231.3125,
      "completions/mean_terminated_length": 231.3125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.4675322622060776,
      "epoch": 0.20430754979157018,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12299149483442307,
      "kl": 0.006061073509044945,
      "learning_rate": 9.591477535896247e-07,
      "loss": 0.1042,
      "num_tokens": 121433182.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 4411,
      "step_time": 28.184160079807043
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 151.5,
      "completions/mean_terminated_length": 151.5,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3533034175634384,
      "epoch": 0.20435386753126447,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003036539303138852,
      "kl": 0.002608212234918028,
      "learning_rate": 9.591384900416858e-07,
      "loss": 0.0001,
      "num_tokens": 121469414.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4412,
      "step_time": 23.193986248224974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 194.3125,
      "completions/mean_terminated_length": 194.3125,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.4515443667769432,
      "epoch": 0.20440018527095877,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011975867673754692,
      "kl": 0.00841202971059829,
      "learning_rate": 9.591292264937471e-07,
      "loss": 0.0004,
      "num_tokens": 121491451.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4413,
      "step_time": 22.457041680812836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 269.0,
      "completions/max_terminated_length": 269.0,
      "completions/mean_length": 233.3125,
      "completions/mean_terminated_length": 233.3125,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "entropy": 0.3064805567264557,
      "epoch": 0.2044465030106531,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1058402955532074,
      "kl": 0.024243885651230812,
      "learning_rate": 9.591199629458083e-07,
      "loss": -0.0399,
      "num_tokens": 121515696.0,
      "reward": 0.5222058892250061,
      "reward_std": 0.363617479801178,
      "rewards/reward_func/mean": 0.5222058892250061,
      "rewards/reward_func/std": 0.36361750960350037,
      "step": 4414,
      "step_time": 26.59234295785427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 172.5,
      "completions/mean_terminated_length": 172.5,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.4450515806674957,
      "epoch": 0.2044928207503474,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030375299975275993,
      "kl": 0.0031095953891053796,
      "learning_rate": 9.591106993978694e-07,
      "loss": 0.0002,
      "num_tokens": 121550088.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4415,
      "step_time": 25.035363737493753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 145.375,
      "completions/mean_terminated_length": 145.375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.25684917718172073,
      "epoch": 0.20453913849004168,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001872918801382184,
      "kl": 0.0014846816484350711,
      "learning_rate": 9.591014358499305e-07,
      "loss": 0.0001,
      "num_tokens": 121571486.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4416,
      "step_time": 16.55630274116993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 124.5,
      "completions/mean_terminated_length": 124.5,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2973170876502991,
      "epoch": 0.20458545622973598,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0056549739092588425,
      "kl": 0.002468589402269572,
      "learning_rate": 9.590921723019916e-07,
      "loss": 0.0001,
      "num_tokens": 121594534.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4417,
      "step_time": 16.745481088757515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 232.5625,
      "completions/mean_terminated_length": 232.5625,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "entropy": 0.22854237258434296,
      "epoch": 0.2046317739694303,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0073995441198349,
      "kl": 0.004934438620693982,
      "learning_rate": 9.590829087540528e-07,
      "loss": 0.0002,
      "num_tokens": 121633423.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4418,
      "step_time": 29.776594549417496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 161.5,
      "completions/mean_terminated_length": 161.5,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3942725211381912,
      "epoch": 0.2046780917091246,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015917306765913963,
      "kl": 0.008968879468739033,
      "learning_rate": 9.590736452061139e-07,
      "loss": 0.0005,
      "num_tokens": 121657847.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4419,
      "step_time": 21.063758365809917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 158.0625,
      "completions/mean_terminated_length": 158.0625,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.19162816181778908,
      "epoch": 0.2047244094488189,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00394805520772934,
      "kl": 0.0029846797697246075,
      "learning_rate": 9.59064381658175e-07,
      "loss": 0.0001,
      "num_tokens": 121678472.0,
      "reward": 0.8702397346496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8702397346496582,
      "rewards/reward_func/std": 0.0,
      "step": 4420,
      "step_time": 19.00661974772811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 252.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 174.4375,
      "completions/mean_terminated_length": 174.4375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.2606373205780983,
      "epoch": 0.2047707271885132,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004122025799006224,
      "kl": 0.0044886431423947215,
      "learning_rate": 9.590551181102361e-07,
      "loss": 0.0002,
      "num_tokens": 121701439.0,
      "reward": 0.31414684653282166,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.31414684653282166,
      "rewards/reward_func/std": 0.0,
      "step": 4421,
      "step_time": 24.56281067058444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 179.5,
      "completions/mean_terminated_length": 179.5,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.20721253752708435,
      "epoch": 0.20481704492820751,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004045045934617519,
      "kl": 0.0024959520087577403,
      "learning_rate": 9.590458545622973e-07,
      "loss": 0.0001,
      "num_tokens": 121738759.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4422,
      "step_time": 24.778950180858374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 147.4375,
      "completions/mean_terminated_length": 147.4375,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.2940957322716713,
      "epoch": 0.2048633626679018,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0042904941365122795,
      "kl": 0.002092456095851958,
      "learning_rate": 9.590365910143584e-07,
      "loss": 0.0001,
      "num_tokens": 121766334.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4423,
      "step_time": 19.694414857774973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 269.0,
      "completions/max_terminated_length": 269.0,
      "completions/mean_length": 220.1875,
      "completions/mean_terminated_length": 220.1875,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.38973531872034073,
      "epoch": 0.2049096804075961,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1366364061832428,
      "kl": 0.014282828662544489,
      "learning_rate": 9.590273274664195e-07,
      "loss": -0.0137,
      "num_tokens": 121788817.0,
      "reward": 0.15132294595241547,
      "reward_std": 0.3253345489501953,
      "rewards/reward_func/mean": 0.15132294595241547,
      "rewards/reward_func/std": 0.3253345489501953,
      "step": 4424,
      "step_time": 27.229420375078917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 142.25,
      "completions/mean_terminated_length": 142.25,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.18300453945994377,
      "epoch": 0.2049559981472904,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002770983148366213,
      "kl": 0.0026137664972338825,
      "learning_rate": 9.590180639184806e-07,
      "loss": 0.0001,
      "num_tokens": 121816197.0,
      "reward": 0.9428731203079224,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9428731203079224,
      "rewards/reward_func/std": 0.0,
      "step": 4425,
      "step_time": 20.217567820101976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 162.625,
      "completions/mean_terminated_length": 162.625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.3745696395635605,
      "epoch": 0.20500231588698473,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027576652355492115,
      "kl": 0.0023471819586120546,
      "learning_rate": 9.59008800370542e-07,
      "loss": 0.0001,
      "num_tokens": 121854255.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4426,
      "step_time": 22.342611964792013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 349.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 227.4375,
      "completions/mean_terminated_length": 227.4375,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.37403204292058945,
      "epoch": 0.20504863362667902,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14391353726387024,
      "kl": 0.02081705490127206,
      "learning_rate": 9.58999536822603e-07,
      "loss": -0.1185,
      "num_tokens": 121881622.0,
      "reward": 0.4375,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.4375,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 4427,
      "step_time": 34.336918134242296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 176.6875,
      "completions/mean_terminated_length": 176.6875,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.3281799256801605,
      "epoch": 0.20509495136637332,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13997496664524078,
      "kl": 0.0050591142498888075,
      "learning_rate": 9.589902732746642e-07,
      "loss": -0.04,
      "num_tokens": 121912241.0,
      "reward": 0.6342861652374268,
      "reward_std": 0.4144529700279236,
      "rewards/reward_func/mean": 0.6342861652374268,
      "rewards/reward_func/std": 0.41445299983024597,
      "step": 4428,
      "step_time": 24.334660917520523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 207.0625,
      "completions/mean_terminated_length": 207.0625,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "entropy": 0.40722689777612686,
      "epoch": 0.20514126910606761,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.09576158970594406,
      "kl": 0.017346865322906524,
      "learning_rate": 9.589810097267253e-07,
      "loss": 0.0008,
      "num_tokens": 121942034.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4429,
      "step_time": 26.369808048009872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 231.0,
      "completions/max_terminated_length": 231.0,
      "completions/mean_length": 183.5,
      "completions/mean_terminated_length": 183.5,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.4024273753166199,
      "epoch": 0.20518758684576194,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029543309938162565,
      "kl": 0.0025500337942503393,
      "learning_rate": 9.589717461787865e-07,
      "loss": 0.0001,
      "num_tokens": 121966858.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4430,
      "step_time": 23.2380328476429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 177.8125,
      "completions/mean_terminated_length": 177.8125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.37463711202144623,
      "epoch": 0.20523390458545623,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014829622581601143,
      "kl": 0.008776698727160692,
      "learning_rate": 9.589624826308476e-07,
      "loss": 0.0004,
      "num_tokens": 121989991.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4431,
      "step_time": 19.534911889582872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 157.625,
      "completions/mean_terminated_length": 157.625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.3643925115466118,
      "epoch": 0.20528022232515053,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006356745958328247,
      "kl": 0.004867341020144522,
      "learning_rate": 9.589532190829087e-07,
      "loss": 0.0002,
      "num_tokens": 122016609.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4432,
      "step_time": 20.41070855781436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 139.0,
      "completions/mean_terminated_length": 139.0,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2526129074394703,
      "epoch": 0.20532654006484483,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018666721880435944,
      "kl": 0.0015373661299236119,
      "learning_rate": 9.589439555349698e-07,
      "loss": 0.0001,
      "num_tokens": 122037409.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4433,
      "step_time": 17.788786247372627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 223.375,
      "completions/mean_terminated_length": 223.375,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.4167468100786209,
      "epoch": 0.20537285780453915,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13366052508354187,
      "kl": 0.015270611038431525,
      "learning_rate": 9.58934691987031e-07,
      "loss": -0.0291,
      "num_tokens": 122060023.0,
      "reward": 0.12705472111701965,
      "reward_std": 0.15925611555576324,
      "rewards/reward_func/mean": 0.12705472111701965,
      "rewards/reward_func/std": 0.15925611555576324,
      "step": 4434,
      "step_time": 24.816624749451876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 120.0,
      "completions/mean_terminated_length": 120.0,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.2214369997382164,
      "epoch": 0.20541917554423345,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005897964350879192,
      "kl": 0.002670010202564299,
      "learning_rate": 9.58925428439092e-07,
      "loss": 0.0001,
      "num_tokens": 122079367.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4435,
      "step_time": 14.795745197683573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 452.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 345.875,
      "completions/mean_terminated_length": 345.875,
      "completions/min_length": 263.0,
      "completions/min_terminated_length": 263.0,
      "entropy": 0.23983752354979515,
      "epoch": 0.20546549328392774,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08564194291830063,
      "kl": 0.006884301896207035,
      "learning_rate": 9.589161648911532e-07,
      "loss": -0.027,
      "num_tokens": 122120389.0,
      "reward": 0.8980263471603394,
      "reward_std": 0.10627995431423187,
      "rewards/reward_func/mean": 0.8980263471603394,
      "rewards/reward_func/std": 0.10627996176481247,
      "step": 4436,
      "step_time": 44.15879878401756
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 207.25,
      "completions/mean_terminated_length": 207.25,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "entropy": 0.1434022542089224,
      "epoch": 0.20551181102362204,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010433053597807884,
      "kl": 0.005398858338594437,
      "learning_rate": 9.589069013432143e-07,
      "loss": 0.0003,
      "num_tokens": 122144441.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4437,
      "step_time": 22.41115650907159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 135.125,
      "completions/mean_terminated_length": 135.125,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2852722741663456,
      "epoch": 0.20555812876331636,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004220884758979082,
      "kl": 0.0028225001879036427,
      "learning_rate": 9.588976377952755e-07,
      "loss": 0.0001,
      "num_tokens": 122172059.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4438,
      "step_time": 17.71949626505375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 124.6875,
      "completions/mean_terminated_length": 124.6875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.25068046152591705,
      "epoch": 0.20560444650301066,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003623596392571926,
      "kl": 0.0022179295192472637,
      "learning_rate": 9.588883742473368e-07,
      "loss": 0.0001,
      "num_tokens": 122192678.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4439,
      "step_time": 14.5108128413558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 363.0,
      "completions/max_terminated_length": 363.0,
      "completions/mean_length": 271.625,
      "completions/mean_terminated_length": 271.625,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.31101303547620773,
      "epoch": 0.20565076424270495,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10069181025028229,
      "kl": 0.02580390591174364,
      "learning_rate": 9.58879110699398e-07,
      "loss": -0.1711,
      "num_tokens": 122225856.0,
      "reward": 0.5459882020950317,
      "reward_std": 0.4171529710292816,
      "rewards/reward_func/mean": 0.5459882020950317,
      "rewards/reward_func/std": 0.417153000831604,
      "step": 4440,
      "step_time": 36.09346652403474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 131.9375,
      "completions/mean_terminated_length": 131.9375,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3213750869035721,
      "epoch": 0.20569708198239925,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022330954670906067,
      "kl": 0.0018797389639075845,
      "learning_rate": 9.588698471514588e-07,
      "loss": 0.0001,
      "num_tokens": 122245503.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4441,
      "step_time": 16.572161994874477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 138.625,
      "completions/mean_terminated_length": 138.625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3204464539885521,
      "epoch": 0.20574339972209357,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026541727129369974,
      "kl": 0.001986485061934218,
      "learning_rate": 9.5886058360352e-07,
      "loss": 0.0001,
      "num_tokens": 122266489.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4442,
      "step_time": 18.283427093178034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 351.0,
      "completions/max_terminated_length": 351.0,
      "completions/mean_length": 315.25,
      "completions/mean_terminated_length": 315.25,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "entropy": 0.2674109488725662,
      "epoch": 0.20578971746178787,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00503187533468008,
      "kl": 0.016199012519791722,
      "learning_rate": 9.588513200555813e-07,
      "loss": 0.0008,
      "num_tokens": 122299581.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4443,
      "step_time": 35.6751859895885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 119.25,
      "completions/mean_terminated_length": 119.25,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "entropy": 0.2758583277463913,
      "epoch": 0.20583603520148216,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003130985889583826,
      "kl": 0.001854931062553078,
      "learning_rate": 9.588420565076424e-07,
      "loss": 0.0001,
      "num_tokens": 122319953.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4444,
      "step_time": 15.612978029996157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 187.6875,
      "completions/mean_terminated_length": 187.6875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.40260956436395645,
      "epoch": 0.20588235294117646,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008119883015751839,
      "kl": 0.005284482031129301,
      "learning_rate": 9.588327929597036e-07,
      "loss": 0.0003,
      "num_tokens": 122347340.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4445,
      "step_time": 23.433291722089052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 149.1875,
      "completions/mean_terminated_length": 149.1875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.23468093946576118,
      "epoch": 0.20592867068087078,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19637228548526764,
      "kl": 0.013076600152999163,
      "learning_rate": 9.588235294117647e-07,
      "loss": 0.0845,
      "num_tokens": 122367967.0,
      "reward": 0.6438281536102295,
      "reward_std": 0.28493744134902954,
      "rewards/reward_func/mean": 0.6438281536102295,
      "rewards/reward_func/std": 0.28493744134902954,
      "step": 4446,
      "step_time": 18.772852525115013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 115.75,
      "completions/mean_terminated_length": 115.75,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.23649794608354568,
      "epoch": 0.20597498842056508,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009086528792977333,
      "kl": 0.003549997869413346,
      "learning_rate": 9.588142658638258e-07,
      "loss": 0.0002,
      "num_tokens": 122387083.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4447,
      "step_time": 13.733539011329412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 395.0,
      "completions/max_terminated_length": 395.0,
      "completions/mean_length": 234.6875,
      "completions/mean_terminated_length": 234.6875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.3815370425581932,
      "epoch": 0.20602130616025938,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009284532628953457,
      "kl": 0.00850689155049622,
      "learning_rate": 9.58805002315887e-07,
      "loss": 0.0004,
      "num_tokens": 122408422.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4448,
      "step_time": 34.92842309176922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 162.25,
      "completions/mean_terminated_length": 162.25,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.16739467531442642,
      "epoch": 0.20606762389995367,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07969807088375092,
      "kl": 0.0027272370643913746,
      "learning_rate": 9.58795738767948e-07,
      "loss": 0.0051,
      "num_tokens": 122432554.0,
      "reward": 0.9014118909835815,
      "reward_std": 0.02629014663398266,
      "rewards/reward_func/mean": 0.9014118909835815,
      "rewards/reward_func/std": 0.026290163397789,
      "step": 4449,
      "step_time": 19.270759791135788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 168.375,
      "completions/mean_terminated_length": 168.375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.17201659083366394,
      "epoch": 0.206113941639648,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005692894570529461,
      "kl": 0.0036805764539167285,
      "learning_rate": 9.587864752200092e-07,
      "loss": 0.0002,
      "num_tokens": 122464800.0,
      "reward": 0.964915931224823,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.964915931224823,
      "rewards/reward_func/std": 0.0,
      "step": 4450,
      "step_time": 22.921984735876322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 130.5,
      "completions/mean_terminated_length": 130.5,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.32443325221538544,
      "epoch": 0.2061602593793423,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003495546290650964,
      "kl": 0.0022235424839891493,
      "learning_rate": 9.587772116720703e-07,
      "loss": 0.0001,
      "num_tokens": 122487272.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4451,
      "step_time": 16.174052443355322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 214.0625,
      "completions/mean_terminated_length": 214.0625,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.38778331875801086,
      "epoch": 0.2062065771190366,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1335936337709427,
      "kl": 0.014234495349228382,
      "learning_rate": 9.587679481241314e-07,
      "loss": -0.0268,
      "num_tokens": 122520505.0,
      "reward": 0.03813140466809273,
      "reward_std": 0.09032773971557617,
      "rewards/reward_func/mean": 0.03813140466809273,
      "rewards/reward_func/std": 0.09032774716615677,
      "step": 4452,
      "step_time": 26.763722702860832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 154.1875,
      "completions/mean_terminated_length": 154.1875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.347697950899601,
      "epoch": 0.20625289485873088,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033534392714500427,
      "kl": 0.0034794610110111535,
      "learning_rate": 9.587586845761928e-07,
      "loss": 0.0002,
      "num_tokens": 122540764.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4453,
      "step_time": 19.237523522228003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 131.9375,
      "completions/mean_terminated_length": 131.9375,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.2027127742767334,
      "epoch": 0.2062992125984252,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004297061823308468,
      "kl": 0.002365048392675817,
      "learning_rate": 9.587494210282537e-07,
      "loss": 0.0001,
      "num_tokens": 122560331.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4454,
      "step_time": 15.467692468315363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 178.4375,
      "completions/mean_terminated_length": 178.4375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.22610027343034744,
      "epoch": 0.2063455303381195,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012379267252981663,
      "kl": 0.009055770700797439,
      "learning_rate": 9.587401574803148e-07,
      "loss": 0.0005,
      "num_tokens": 122581730.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4455,
      "step_time": 19.87071193009615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 272.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 214.3125,
      "completions/mean_terminated_length": 214.3125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "entropy": 0.33952800184488297,
      "epoch": 0.2063918480778138,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10389196127653122,
      "kl": 0.027087272610515356,
      "learning_rate": 9.587308939323761e-07,
      "loss": -0.0913,
      "num_tokens": 122605719.0,
      "reward": 0.4375,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.4375,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 4456,
      "step_time": 26.03425007686019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 211.9375,
      "completions/mean_terminated_length": 211.9375,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.184910599142313,
      "epoch": 0.2064381658175081,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09489887207746506,
      "kl": 0.02374209789559245,
      "learning_rate": 9.587216303844373e-07,
      "loss": -0.0467,
      "num_tokens": 122630902.0,
      "reward": 0.7882111072540283,
      "reward_std": 0.16943113505840302,
      "rewards/reward_func/mean": 0.7882111072540283,
      "rewards/reward_func/std": 0.1694311648607254,
      "step": 4457,
      "step_time": 27.780203569680452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 124.5,
      "completions/mean_terminated_length": 124.5,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2856317386031151,
      "epoch": 0.20648448355720242,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004135873634368181,
      "kl": 0.002742825250606984,
      "learning_rate": 9.587123668364984e-07,
      "loss": 0.0001,
      "num_tokens": 122652542.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4458,
      "step_time": 15.610228545963764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 227.5,
      "completions/mean_terminated_length": 227.5,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.17769036814570427,
      "epoch": 0.20653080129689672,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034338689874857664,
      "kl": 0.0032599986298009753,
      "learning_rate": 9.587031032885595e-07,
      "loss": 0.0002,
      "num_tokens": 122681958.0,
      "reward": 0.9489824771881104,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9489824771881104,
      "rewards/reward_func/std": 0.0,
      "step": 4459,
      "step_time": 27.415871299803257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 129.8125,
      "completions/mean_terminated_length": 129.8125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.27154625207185745,
      "epoch": 0.206577119036591,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002972422167658806,
      "kl": 0.0022357859706971794,
      "learning_rate": 9.586938397406206e-07,
      "loss": 0.0001,
      "num_tokens": 122705427.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4460,
      "step_time": 17.25046358630061
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 147.875,
      "completions/mean_terminated_length": 147.875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.24585303664207458,
      "epoch": 0.2066234367762853,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21688760817050934,
      "kl": 0.011696155939716846,
      "learning_rate": 9.586845761926818e-07,
      "loss": -0.0442,
      "num_tokens": 122725617.0,
      "reward": 0.7421827912330627,
      "reward_std": 0.06671953946352005,
      "rewards/reward_func/mean": 0.7421827912330627,
      "rewards/reward_func/std": 0.06671953201293945,
      "step": 4461,
      "step_time": 18.486302569508553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 291.0,
      "completions/max_terminated_length": 291.0,
      "completions/mean_length": 241.8125,
      "completions/mean_terminated_length": 241.8125,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "entropy": 0.45322611927986145,
      "epoch": 0.20666975451597963,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11351599544286728,
      "kl": 0.00714661309029907,
      "learning_rate": 9.586753126447429e-07,
      "loss": 0.1097,
      "num_tokens": 122763326.0,
      "reward": 0.4375,
      "reward_std": 0.5123475193977356,
      "rewards/reward_func/mean": 0.4375,
      "rewards/reward_func/std": 0.5123475790023804,
      "step": 4462,
      "step_time": 34.19764931499958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 192.3125,
      "completions/mean_terminated_length": 192.3125,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.44985658675432205,
      "epoch": 0.20671607225567393,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12578284740447998,
      "kl": 0.005065404460765421,
      "learning_rate": 9.58666049096804e-07,
      "loss": 0.0635,
      "num_tokens": 122797507.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4463,
      "step_time": 25.71725757792592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 187.4375,
      "completions/mean_terminated_length": 187.4375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.2840280532836914,
      "epoch": 0.20676238999536822,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009209983982145786,
      "kl": 0.009957100730389357,
      "learning_rate": 9.586567855488651e-07,
      "loss": 0.0005,
      "num_tokens": 122818714.0,
      "reward": 0.9021315574645996,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9021315574645996,
      "rewards/reward_func/std": 0.0,
      "step": 4464,
      "step_time": 23.85198138281703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 240.1875,
      "completions/mean_terminated_length": 240.1875,
      "completions/min_length": 212.0,
      "completions/min_terminated_length": 212.0,
      "entropy": 0.23530149459838867,
      "epoch": 0.20680870773506252,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0069038900546729565,
      "kl": 0.00827450561337173,
      "learning_rate": 9.586475220009263e-07,
      "loss": 0.0004,
      "num_tokens": 122842813.0,
      "reward": 0.9775290489196777,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9775290489196777,
      "rewards/reward_func/std": 0.0,
      "step": 4465,
      "step_time": 25.949667435139418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 302.0,
      "completions/max_terminated_length": 302.0,
      "completions/mean_length": 270.1875,
      "completions/mean_terminated_length": 270.1875,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "entropy": 0.22654571384191513,
      "epoch": 0.20685502547475684,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006940394174307585,
      "kl": 0.004943744745105505,
      "learning_rate": 9.586382584529874e-07,
      "loss": 0.0002,
      "num_tokens": 122866544.0,
      "reward": 0.686811089515686,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.686811089515686,
      "rewards/reward_func/std": 0.0,
      "step": 4466,
      "step_time": 27.922365579754114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 139.75,
      "completions/mean_terminated_length": 139.75,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.32575297355651855,
      "epoch": 0.20690134321445114,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027997978031635284,
      "kl": 0.002147680992493406,
      "learning_rate": 9.586289949050485e-07,
      "loss": 0.0001,
      "num_tokens": 122890428.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4467,
      "step_time": 17.359182715415955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 281.0,
      "completions/max_terminated_length": 281.0,
      "completions/mean_length": 254.125,
      "completions/mean_terminated_length": 254.125,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "entropy": 0.20986905321478844,
      "epoch": 0.20694766095414543,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08784881979227066,
      "kl": 0.004571834113448858,
      "learning_rate": 9.586197313571096e-07,
      "loss": 0.0031,
      "num_tokens": 122914446.0,
      "reward": 0.7982771396636963,
      "reward_std": 0.031642720103263855,
      "rewards/reward_func/mean": 0.7982771396636963,
      "rewards/reward_func/std": 0.03164271265268326,
      "step": 4468,
      "step_time": 26.15432145446539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 150.875,
      "completions/mean_terminated_length": 150.875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3842119202017784,
      "epoch": 0.20699397869383973,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001620774739421904,
      "kl": 0.002083538507577032,
      "learning_rate": 9.58610467809171e-07,
      "loss": 0.0001,
      "num_tokens": 122972572.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4469,
      "step_time": 27.610013004392385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 157.3125,
      "completions/mean_terminated_length": 157.3125,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.19669905677437782,
      "epoch": 0.20704029643353405,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016642799600958824,
      "kl": 0.0013919929624535143,
      "learning_rate": 9.58601204261232e-07,
      "loss": 0.0001,
      "num_tokens": 122993361.0,
      "reward": 0.9131007194519043,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9131007194519043,
      "rewards/reward_func/std": 0.0,
      "step": 4470,
      "step_time": 20.828556288033724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 367.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 313.0625,
      "completions/mean_terminated_length": 313.0625,
      "completions/min_length": 262.0,
      "completions/min_terminated_length": 262.0,
      "entropy": 0.18001729249954224,
      "epoch": 0.20708661417322835,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019139184150844812,
      "kl": 0.002082289836835116,
      "learning_rate": 9.585919407132932e-07,
      "loss": 0.0001,
      "num_tokens": 123027490.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4471,
      "step_time": 35.17898824065924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 275.0,
      "completions/max_terminated_length": 275.0,
      "completions/mean_length": 202.3125,
      "completions/mean_terminated_length": 202.3125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.31132566928863525,
      "epoch": 0.20713293191292265,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10304898768663406,
      "kl": 0.016367219388484955,
      "learning_rate": 9.585826771653541e-07,
      "loss": 0.0174,
      "num_tokens": 123049959.0,
      "reward": 0.969406247138977,
      "reward_std": 0.0835980772972107,
      "rewards/reward_func/mean": 0.969406247138977,
      "rewards/reward_func/std": 0.08359809219837189,
      "step": 4472,
      "step_time": 26.043080002069473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 121.0625,
      "completions/mean_terminated_length": 121.0625,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.32665877044200897,
      "epoch": 0.20717924965261694,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003180100815370679,
      "kl": 0.0021426203893497586,
      "learning_rate": 9.585734136174155e-07,
      "loss": 0.0001,
      "num_tokens": 123078232.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4473,
      "step_time": 16.59187662974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 173.875,
      "completions/mean_terminated_length": 173.875,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "entropy": 0.4231932908296585,
      "epoch": 0.20722556739231127,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037095234729349613,
      "kl": 0.0027809487655758858,
      "learning_rate": 9.585641500694766e-07,
      "loss": 0.0001,
      "num_tokens": 123120454.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4474,
      "step_time": 24.51600181683898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 180.125,
      "completions/mean_terminated_length": 180.125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "entropy": 0.3709437772631645,
      "epoch": 0.20727188513200556,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024438004475086927,
      "kl": 0.002327405847609043,
      "learning_rate": 9.585548865215377e-07,
      "loss": 0.0001,
      "num_tokens": 123148056.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4475,
      "step_time": 23.199911538511515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 123.375,
      "completions/mean_terminated_length": 123.375,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.3465873748064041,
      "epoch": 0.20731820287169986,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034997561015188694,
      "kl": 0.00299051619367674,
      "learning_rate": 9.585456229735989e-07,
      "loss": 0.0001,
      "num_tokens": 123172318.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4476,
      "step_time": 17.120753288269043
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 171.8125,
      "completions/mean_terminated_length": 171.8125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.42620351910591125,
      "epoch": 0.20736452061139415,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006532012950628996,
      "kl": 0.0044926885748282075,
      "learning_rate": 9.5853635942566e-07,
      "loss": 0.0002,
      "num_tokens": 123209963.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4477,
      "step_time": 25.134059321135283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 135.4375,
      "completions/mean_terminated_length": 135.4375,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.21477627009153366,
      "epoch": 0.20741083835108848,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002007042523473501,
      "kl": 0.00134053552756086,
      "learning_rate": 9.58527095877721e-07,
      "loss": 0.0001,
      "num_tokens": 123229570.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4478,
      "step_time": 16.767207119613886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 140.125,
      "completions/mean_terminated_length": 140.125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.27937763184309006,
      "epoch": 0.20745715609078277,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005686054937541485,
      "kl": 0.003374650375917554,
      "learning_rate": 9.585178323297822e-07,
      "loss": 0.0002,
      "num_tokens": 123249396.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4479,
      "step_time": 17.23304009065032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 132.0,
      "completions/mean_terminated_length": 132.0,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2886121869087219,
      "epoch": 0.20750347383047707,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003832530463114381,
      "kl": 0.0024614507565274835,
      "learning_rate": 9.585085687818434e-07,
      "loss": 0.0001,
      "num_tokens": 123269956.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4480,
      "step_time": 17.470990426838398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 204.625,
      "completions/mean_terminated_length": 204.625,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.2175096571445465,
      "epoch": 0.20754979157017137,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11134587228298187,
      "kl": 0.027105043176561594,
      "learning_rate": 9.584993052339045e-07,
      "loss": -0.0259,
      "num_tokens": 123295918.0,
      "reward": 0.7903306484222412,
      "reward_std": 0.21662232279777527,
      "rewards/reward_func/mean": 0.7903306484222412,
      "rewards/reward_func/std": 0.21662232279777527,
      "step": 4481,
      "step_time": 23.024387542158365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 191.875,
      "completions/mean_terminated_length": 191.875,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.39290447533130646,
      "epoch": 0.2075961093098657,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.021612225100398064,
      "kl": 0.009134544990956783,
      "learning_rate": 9.584900416859656e-07,
      "loss": 0.0005,
      "num_tokens": 123331532.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4482,
      "step_time": 26.628085043281317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 204.3125,
      "completions/mean_terminated_length": 204.3125,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.2346363514661789,
      "epoch": 0.20764242704955999,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025011177640408278,
      "kl": 0.0022415997518692166,
      "learning_rate": 9.58480778138027e-07,
      "loss": 0.0001,
      "num_tokens": 123369057.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 4483,
      "step_time": 27.2366630025208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 126.5,
      "completions/mean_terminated_length": 126.5,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.28307194262742996,
      "epoch": 0.20768874478925428,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002643357031047344,
      "kl": 0.0018845770973712206,
      "learning_rate": 9.58471514590088e-07,
      "loss": 0.0001,
      "num_tokens": 123395449.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4484,
      "step_time": 17.27096877992153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 170.625,
      "completions/mean_terminated_length": 170.625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.16094931587576866,
      "epoch": 0.20773506252894858,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005039875395596027,
      "kl": 0.004182444128673524,
      "learning_rate": 9.58462251042149e-07,
      "loss": 0.0002,
      "num_tokens": 123418963.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4485,
      "step_time": 19.453738171607256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 170.0625,
      "completions/mean_terminated_length": 170.0625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.18804769590497017,
      "epoch": 0.2077813802686429,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007854282855987549,
      "kl": 0.05850925948470831,
      "learning_rate": 9.584529874942103e-07,
      "loss": 0.0029,
      "num_tokens": 123441428.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4486,
      "step_time": 23.984460175037384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.0,
      "completions/max_terminated_length": 139.0,
      "completions/mean_length": 119.3125,
      "completions/mean_terminated_length": 119.3125,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.29421278089284897,
      "epoch": 0.2078276980083372,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004592785611748695,
      "kl": 0.0025236682558897883,
      "learning_rate": 9.584437239462714e-07,
      "loss": 0.0001,
      "num_tokens": 123463369.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4487,
      "step_time": 16.386843316257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 394.0,
      "completions/max_terminated_length": 394.0,
      "completions/mean_length": 365.6875,
      "completions/mean_terminated_length": 365.6875,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "entropy": 0.16563784703612328,
      "epoch": 0.2078740157480315,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003584612160921097,
      "kl": 0.00325860851444304,
      "learning_rate": 9.584344603983326e-07,
      "loss": 0.0002,
      "num_tokens": 123492132.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4488,
      "step_time": 36.691088780760765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 148.1875,
      "completions/mean_terminated_length": 148.1875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.33117473870515823,
      "epoch": 0.2079203334877258,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15245525538921356,
      "kl": 0.01356105669401586,
      "learning_rate": 9.584251968503937e-07,
      "loss": -0.0058,
      "num_tokens": 123513639.0,
      "reward": 0.042260896414518356,
      "reward_std": 0.16904358565807343,
      "rewards/reward_func/mean": 0.042260896414518356,
      "rewards/reward_func/std": 0.16904358565807343,
      "step": 4489,
      "step_time": 17.898266334086657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 115.0,
      "completions/mean_terminated_length": 115.0,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.26029717922210693,
      "epoch": 0.2079666512274201,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014157130382955074,
      "kl": 0.006502093398012221,
      "learning_rate": 9.584159333024548e-07,
      "loss": 0.0003,
      "num_tokens": 123533623.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4490,
      "step_time": 14.306664638221264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 131.75,
      "completions/mean_terminated_length": 131.75,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.31367746740579605,
      "epoch": 0.2080129689671144,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00331858335994184,
      "kl": 0.002797766006551683,
      "learning_rate": 9.58406669754516e-07,
      "loss": 0.0001,
      "num_tokens": 123557107.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4491,
      "step_time": 17.110287327319384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 138.1875,
      "completions/mean_terminated_length": 138.1875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.1890847384929657,
      "epoch": 0.2080592867068087,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013698014663532376,
      "kl": 0.001124966685893014,
      "learning_rate": 9.58397406206577e-07,
      "loss": 0.0001,
      "num_tokens": 123576918.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4492,
      "step_time": 15.461940791457891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 161.5,
      "completions/mean_terminated_length": 161.5,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.2741554155945778,
      "epoch": 0.208105604446503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004652024712413549,
      "kl": 0.0028163317474536598,
      "learning_rate": 9.583881426586382e-07,
      "loss": 0.0001,
      "num_tokens": 123599566.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4493,
      "step_time": 20.036991473287344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 160.6875,
      "completions/mean_terminated_length": 160.6875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.2065758854150772,
      "epoch": 0.20815192218619732,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038431889843195677,
      "kl": 0.0022339446586556733,
      "learning_rate": 9.583788791106993e-07,
      "loss": 0.0001,
      "num_tokens": 123636953.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4494,
      "step_time": 22.542246356606483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 154.625,
      "completions/mean_terminated_length": 154.625,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.18167023360729218,
      "epoch": 0.20819823992589162,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12746275961399078,
      "kl": 0.02427427435759455,
      "learning_rate": 9.583696155627604e-07,
      "loss": 0.0071,
      "num_tokens": 123661843.0,
      "reward": 0.9681414365768433,
      "reward_std": 0.015806283801794052,
      "rewards/reward_func/mean": 0.9681414365768433,
      "rewards/reward_func/std": 0.015806281939148903,
      "step": 4495,
      "step_time": 18.49478606879711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 114.6875,
      "completions/mean_terminated_length": 114.6875,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.24950814619660378,
      "epoch": 0.20824455766558592,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004354960285127163,
      "kl": 0.0026552329654805362,
      "learning_rate": 9.583603520148218e-07,
      "loss": 0.0001,
      "num_tokens": 123681374.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4496,
      "step_time": 15.170330747961998
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 116.1875,
      "completions/mean_terminated_length": 116.1875,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.31312163919210434,
      "epoch": 0.2082908754052802,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005197524558752775,
      "kl": 0.0026675930712372065,
      "learning_rate": 9.583510884668827e-07,
      "loss": 0.0001,
      "num_tokens": 123701777.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4497,
      "step_time": 16.79928321763873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 138.6875,
      "completions/mean_terminated_length": 138.6875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3549370616674423,
      "epoch": 0.20833719314497454,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003198597813025117,
      "kl": 0.002531467005610466,
      "learning_rate": 9.583418249189438e-07,
      "loss": 0.0001,
      "num_tokens": 123729708.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4498,
      "step_time": 17.87234526500106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 129.9375,
      "completions/mean_terminated_length": 129.9375,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.32815539091825485,
      "epoch": 0.20838351088466883,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026750012766569853,
      "kl": 0.0021164837235119194,
      "learning_rate": 9.583325613710051e-07,
      "loss": 0.0001,
      "num_tokens": 123755035.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4499,
      "step_time": 16.812711495906115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 128.5,
      "completions/mean_terminated_length": 128.5,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3110589236021042,
      "epoch": 0.20842982862436313,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036182873882353306,
      "kl": 0.002530342200770974,
      "learning_rate": 9.583232978230663e-07,
      "loss": 0.0001,
      "num_tokens": 123780483.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4500,
      "step_time": 16.715143274515867
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 125.0,
      "completions/mean_terminated_length": 125.0,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.308862529695034,
      "epoch": 0.20847614636405742,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020533078350126743,
      "kl": 0.00184511041152291,
      "learning_rate": 9.583140342751274e-07,
      "loss": 0.0001,
      "num_tokens": 123807427.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4501,
      "step_time": 17.075216569006443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 206.75,
      "completions/mean_terminated_length": 206.75,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.16802659630775452,
      "epoch": 0.20852246410375175,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00467786705121398,
      "kl": 0.0035754297859966755,
      "learning_rate": 9.583047707271885e-07,
      "loss": 0.0002,
      "num_tokens": 123834015.0,
      "reward": 0.6246347427368164,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6246347427368164,
      "rewards/reward_func/std": 0.0,
      "step": 4502,
      "step_time": 23.39074996113777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 189.3125,
      "completions/mean_terminated_length": 189.3125,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.4275503605604172,
      "epoch": 0.20856878184344604,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12397279590368271,
      "kl": 0.017243289854377508,
      "learning_rate": 9.582955071792496e-07,
      "loss": -0.0971,
      "num_tokens": 123856804.0,
      "reward": 0.08718213438987732,
      "reward_std": 0.23822695016860962,
      "rewards/reward_func/mean": 0.08718213438987732,
      "rewards/reward_func/std": 0.23822695016860962,
      "step": 4503,
      "step_time": 23.68745656311512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 126.6875,
      "completions/mean_terminated_length": 126.6875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2466145195066929,
      "epoch": 0.20861509958314034,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025737672112882137,
      "kl": 0.002056895347777754,
      "learning_rate": 9.582862436313108e-07,
      "loss": 0.0001,
      "num_tokens": 123882271.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4504,
      "step_time": 16.518370371311903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 199.25,
      "completions/mean_terminated_length": 199.25,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.3941348195075989,
      "epoch": 0.20866141732283464,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003663417650386691,
      "kl": 0.003953707404434681,
      "learning_rate": 9.58276980083372e-07,
      "loss": 0.0002,
      "num_tokens": 123904419.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4505,
      "step_time": 23.864782005548477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 125.25,
      "completions/mean_terminated_length": 125.25,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.2710713595151901,
      "epoch": 0.20870773506252896,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00256364862434566,
      "kl": 0.001976566738449037,
      "learning_rate": 9.58267716535433e-07,
      "loss": 0.0001,
      "num_tokens": 123927511.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4506,
      "step_time": 16.101536702364683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 212.0,
      "completions/max_terminated_length": 212.0,
      "completions/mean_length": 178.75,
      "completions/mean_terminated_length": 178.75,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.2687610983848572,
      "epoch": 0.20875405280222326,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13785500824451447,
      "kl": 0.029831380117684603,
      "learning_rate": 9.582584529874941e-07,
      "loss": -0.0317,
      "num_tokens": 123964499.0,
      "reward": 0.7785302996635437,
      "reward_std": 0.17717573046684265,
      "rewards/reward_func/mean": 0.7785302996635437,
      "rewards/reward_func/std": 0.17717573046684265,
      "step": 4507,
      "step_time": 25.48934479802847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 160.0,
      "completions/mean_terminated_length": 160.0,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.32828326523303986,
      "epoch": 0.20880037054191755,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029990829061716795,
      "kl": 0.0021405020961537957,
      "learning_rate": 9.582491894395553e-07,
      "loss": 0.0001,
      "num_tokens": 123992707.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4508,
      "step_time": 20.664042081683874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 204.5625,
      "completions/mean_terminated_length": 204.5625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.36476946622133255,
      "epoch": 0.20884668828161185,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011225164867937565,
      "kl": 0.0076727664563804865,
      "learning_rate": 9.582399258916164e-07,
      "loss": 0.0004,
      "num_tokens": 124025036.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4509,
      "step_time": 27.071549084037542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 118.0,
      "completions/mean_terminated_length": 118.0,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.2992003932595253,
      "epoch": 0.20889300602130617,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0068072606809437275,
      "kl": 0.0032806770759634674,
      "learning_rate": 9.582306623436775e-07,
      "loss": 0.0002,
      "num_tokens": 124044988.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4510,
      "step_time": 15.329447764903307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 116.6875,
      "completions/mean_terminated_length": 116.6875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.3145041763782501,
      "epoch": 0.20893932376100047,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037256877403706312,
      "kl": 0.0029680809238925576,
      "learning_rate": 9.582213987957386e-07,
      "loss": 0.0001,
      "num_tokens": 124067719.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4511,
      "step_time": 15.804791938513517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 135.125,
      "completions/mean_terminated_length": 135.125,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.25249340385198593,
      "epoch": 0.20898564150069476,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004193861503154039,
      "kl": 0.0022256918309722096,
      "learning_rate": 9.582121352477998e-07,
      "loss": 0.0001,
      "num_tokens": 124088521.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4512,
      "step_time": 16.926363054662943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 149.625,
      "completions/mean_terminated_length": 149.625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.29557929188013077,
      "epoch": 0.20903195924038906,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004239782691001892,
      "kl": 0.03148888412397355,
      "learning_rate": 9.582028716998611e-07,
      "loss": 0.0015,
      "num_tokens": 124121779.0,
      "reward": 0.26359713077545166,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.26359713077545166,
      "rewards/reward_func/std": 0.0,
      "step": 4513,
      "step_time": 20.87648806348443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 387.0,
      "completions/max_terminated_length": 387.0,
      "completions/mean_length": 212.0,
      "completions/mean_terminated_length": 212.0,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.3210453614592552,
      "epoch": 0.20907827698008338,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11319698393344879,
      "kl": 0.022904083132743835,
      "learning_rate": 9.581936081519222e-07,
      "loss": -0.0202,
      "num_tokens": 124143859.0,
      "reward": 0.9328369498252869,
      "reward_std": 0.24945172667503357,
      "rewards/reward_func/mean": 0.9328369498252869,
      "rewards/reward_func/std": 0.24945174157619476,
      "step": 4514,
      "step_time": 34.970582257956266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 187.8125,
      "completions/mean_terminated_length": 187.8125,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.2327510304749012,
      "epoch": 0.20912459471977768,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006550724618136883,
      "kl": 0.006306188879534602,
      "learning_rate": 9.581843446039831e-07,
      "loss": 0.0003,
      "num_tokens": 124171360.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4515,
      "step_time": 22.35580413416028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 126.8125,
      "completions/mean_terminated_length": 126.8125,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.29604586213827133,
      "epoch": 0.20917091245947197,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019563401583582163,
      "kl": 0.0016671387711539865,
      "learning_rate": 9.581750810560445e-07,
      "loss": 0.0001,
      "num_tokens": 124194749.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4516,
      "step_time": 16.582060985267162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 175.3125,
      "completions/mean_terminated_length": 175.3125,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.2425302192568779,
      "epoch": 0.20921723019916627,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023137491662055254,
      "kl": 0.0021871365315746516,
      "learning_rate": 9.581658175081056e-07,
      "loss": 0.0001,
      "num_tokens": 124230434.0,
      "reward": 0.11362193524837494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.11362193524837494,
      "rewards/reward_func/std": 0.0,
      "step": 4517,
      "step_time": 24.648380033671856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 197.9375,
      "completions/mean_terminated_length": 197.9375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.1759553737938404,
      "epoch": 0.2092635479388606,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005160289816558361,
      "kl": 0.003777960315346718,
      "learning_rate": 9.581565539601667e-07,
      "loss": 0.0002,
      "num_tokens": 124259857.0,
      "reward": 0.8131037354469299,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8131037354469299,
      "rewards/reward_func/std": 0.0,
      "step": 4518,
      "step_time": 23.335392862558365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 173.125,
      "completions/mean_terminated_length": 173.125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.40470677614212036,
      "epoch": 0.2093098656785549,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006334818433970213,
      "kl": 0.005239688558503985,
      "learning_rate": 9.581472904122279e-07,
      "loss": 0.0003,
      "num_tokens": 124281507.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4519,
      "step_time": 20.79850087314844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 174.0,
      "completions/mean_terminated_length": 174.0,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.19005835056304932,
      "epoch": 0.2093561834182492,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14783187210559845,
      "kl": 0.032623309176415205,
      "learning_rate": 9.58138026864289e-07,
      "loss": -0.0332,
      "num_tokens": 124305155.0,
      "reward": 0.6320604085922241,
      "reward_std": 0.22408708930015564,
      "rewards/reward_func/mean": 0.6320604085922241,
      "rewards/reward_func/std": 0.22408710420131683,
      "step": 4520,
      "step_time": 22.775241017341614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 281.0,
      "completions/max_terminated_length": 281.0,
      "completions/mean_length": 200.5,
      "completions/mean_terminated_length": 200.5,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.39643463492393494,
      "epoch": 0.20940250115794348,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09006068110466003,
      "kl": 0.004479829920455813,
      "learning_rate": 9.581287633163501e-07,
      "loss": 0.0988,
      "num_tokens": 124327819.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 4521,
      "step_time": 26.601082772016525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 261.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 195.75,
      "completions/mean_terminated_length": 195.75,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.4853120595216751,
      "epoch": 0.2094488188976378,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1422317773103714,
      "kl": 0.009126980789005756,
      "learning_rate": 9.581194997684112e-07,
      "loss": 0.1064,
      "num_tokens": 124353111.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 4522,
      "step_time": 26.04064156487584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 194.1875,
      "completions/mean_terminated_length": 194.1875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.40737374126911163,
      "epoch": 0.2094951366373321,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006968447007238865,
      "kl": 0.005220564664341509,
      "learning_rate": 9.581102362204724e-07,
      "loss": 0.0003,
      "num_tokens": 124376570.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4523,
      "step_time": 23.655402705073357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 147.8125,
      "completions/mean_terminated_length": 147.8125,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.21773340553045273,
      "epoch": 0.2095414543770264,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028009614907205105,
      "kl": 0.0018515900010243058,
      "learning_rate": 9.581009726725335e-07,
      "loss": 0.0001,
      "num_tokens": 124396759.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4524,
      "step_time": 17.065115593373775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 139.4375,
      "completions/mean_terminated_length": 139.4375,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.29620032012462616,
      "epoch": 0.2095877721167207,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002556238789111376,
      "kl": 0.001933709834702313,
      "learning_rate": 9.580917091245946e-07,
      "loss": 0.0001,
      "num_tokens": 124432750.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4525,
      "step_time": 20.982255693525076
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 154.8125,
      "completions/mean_terminated_length": 154.8125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.24403830617666245,
      "epoch": 0.20963408985641502,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006410935427993536,
      "kl": 0.005382670904509723,
      "learning_rate": 9.58082445576656e-07,
      "loss": 0.0003,
      "num_tokens": 124459291.0,
      "reward": 0.3384654223918915,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3384654223918915,
      "rewards/reward_func/std": 0.0,
      "step": 4526,
      "step_time": 20.046632915735245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 171.25,
      "completions/mean_terminated_length": 171.25,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.37928277254104614,
      "epoch": 0.20968040759610931,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.021629396826028824,
      "kl": 0.004303328867536038,
      "learning_rate": 9.58073182028717e-07,
      "loss": 0.0002,
      "num_tokens": 124494271.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4527,
      "step_time": 22.399007219821215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 125.375,
      "completions/mean_terminated_length": 125.375,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.28371690958738327,
      "epoch": 0.2097267253358036,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023662883322685957,
      "kl": 0.0022507273824885488,
      "learning_rate": 9.58063918480778e-07,
      "loss": 0.0001,
      "num_tokens": 124515365.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4528,
      "step_time": 17.098990455269814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 154.6875,
      "completions/mean_terminated_length": 154.6875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.4401158168911934,
      "epoch": 0.2097730430754979,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002236809115856886,
      "kl": 0.0022928886755835265,
      "learning_rate": 9.580546549328393e-07,
      "loss": 0.0001,
      "num_tokens": 124559936.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4529,
      "step_time": 24.50910273194313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 251.0,
      "completions/max_terminated_length": 251.0,
      "completions/mean_length": 191.75,
      "completions/mean_terminated_length": 191.75,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.24911126121878624,
      "epoch": 0.20981936081519223,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07972320914268494,
      "kl": 0.0025148409185931087,
      "learning_rate": 9.580453913849004e-07,
      "loss": -0.0235,
      "num_tokens": 124586892.0,
      "reward": 0.9431997537612915,
      "reward_std": 0.015146732330322266,
      "rewards/reward_func/mean": 0.9431997537612915,
      "rewards/reward_func/std": 0.015146732330322266,
      "step": 4530,
      "step_time": 26.23358115181327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 129.3125,
      "completions/mean_terminated_length": 129.3125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.3201339915394783,
      "epoch": 0.20986567855488653,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006610604003071785,
      "kl": 0.0034712717751972377,
      "learning_rate": 9.580361278369616e-07,
      "loss": 0.0002,
      "num_tokens": 124606897.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4531,
      "step_time": 15.311695747077465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 125.5625,
      "completions/mean_terminated_length": 125.5625,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.1992301195859909,
      "epoch": 0.20991199629458082,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003081638365983963,
      "kl": 0.0017424341931473464,
      "learning_rate": 9.580268642890227e-07,
      "loss": 0.0001,
      "num_tokens": 124626314.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4532,
      "step_time": 16.176436487585306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 188.0625,
      "completions/mean_terminated_length": 188.0625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.3342147395014763,
      "epoch": 0.20995831403427512,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0051155127584934235,
      "kl": 0.003918955335393548,
      "learning_rate": 9.580176007410838e-07,
      "loss": 0.0002,
      "num_tokens": 124655371.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4533,
      "step_time": 24.33262361958623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 164.5625,
      "completions/mean_terminated_length": 164.5625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.4239857494831085,
      "epoch": 0.21000463177396944,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022273610811680555,
      "kl": 0.0020079879905097187,
      "learning_rate": 9.58008337193145e-07,
      "loss": 0.0001,
      "num_tokens": 124695780.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4534,
      "step_time": 23.14905248582363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 135.5,
      "completions/mean_terminated_length": 135.5,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.35153312236070633,
      "epoch": 0.21005094951366374,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003029546467587352,
      "kl": 0.002299476764164865,
      "learning_rate": 9.57999073645206e-07,
      "loss": 0.0001,
      "num_tokens": 124730828.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4535,
      "step_time": 19.38253891095519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 149.5,
      "completions/mean_terminated_length": 149.5,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.37292734533548355,
      "epoch": 0.21009726725335803,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004241833463311195,
      "kl": 0.0028279765392653644,
      "learning_rate": 9.579898100972672e-07,
      "loss": 0.0001,
      "num_tokens": 124766836.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4536,
      "step_time": 22.115824546664953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 155.625,
      "completions/mean_terminated_length": 155.625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.2844870761036873,
      "epoch": 0.21014358499305233,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1563979685306549,
      "kl": 0.009537339094094932,
      "learning_rate": 9.579805465493283e-07,
      "loss": 0.0557,
      "num_tokens": 124787806.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4537,
      "step_time": 22.45786213129759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 133.0,
      "completions/mean_terminated_length": 133.0,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.26435963809490204,
      "epoch": 0.21018990273274665,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027738655917346478,
      "kl": 0.0017964460421353579,
      "learning_rate": 9.579712830013894e-07,
      "loss": 0.0001,
      "num_tokens": 124808974.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4538,
      "step_time": 16.81413860619068
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 134.875,
      "completions/mean_terminated_length": 134.875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.30298901349306107,
      "epoch": 0.21023622047244095,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004321299027651548,
      "kl": 0.0024628351093269885,
      "learning_rate": 9.579620194534508e-07,
      "loss": 0.0001,
      "num_tokens": 124831916.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4539,
      "step_time": 16.509110625833273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 145.1875,
      "completions/mean_terminated_length": 145.1875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.45492929965257645,
      "epoch": 0.21028253821213524,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024500072468072176,
      "kl": 0.002628411049954593,
      "learning_rate": 9.579527559055117e-07,
      "loss": 0.0001,
      "num_tokens": 124873775.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4540,
      "step_time": 23.426053818315268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 254.0,
      "completions/max_terminated_length": 254.0,
      "completions/mean_length": 222.3125,
      "completions/mean_terminated_length": 222.3125,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.45737026631832123,
      "epoch": 0.21032885595182954,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08502742648124695,
      "kl": 0.005424272269010544,
      "learning_rate": 9.579434923575728e-07,
      "loss": 0.0314,
      "num_tokens": 124895828.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4541,
      "step_time": 25.104504711925983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 135.8125,
      "completions/mean_terminated_length": 135.8125,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.22275033220648766,
      "epoch": 0.21037517369152386,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028084134683012962,
      "kl": 0.0016665546572767198,
      "learning_rate": 9.57934228809634e-07,
      "loss": 0.0001,
      "num_tokens": 124915505.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4542,
      "step_time": 17.30308758467436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 137.625,
      "completions/mean_terminated_length": 137.625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.34233835339546204,
      "epoch": 0.21042149143121816,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034828847274184227,
      "kl": 0.0029158778488636017,
      "learning_rate": 9.579249652616953e-07,
      "loss": 0.0001,
      "num_tokens": 124951627.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4543,
      "step_time": 21.10429633408785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 196.375,
      "completions/mean_terminated_length": 196.375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.2279246486723423,
      "epoch": 0.21046780917091246,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1677601933479309,
      "kl": 0.03046353254467249,
      "learning_rate": 9.579157017137564e-07,
      "loss": -0.0139,
      "num_tokens": 124976257.0,
      "reward": 0.8965252637863159,
      "reward_std": 0.1090937927365303,
      "rewards/reward_func/mean": 0.8965252637863159,
      "rewards/reward_func/std": 0.1090938076376915,
      "step": 4544,
      "step_time": 22.578668504953384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 324.0,
      "completions/max_terminated_length": 324.0,
      "completions/mean_length": 230.5,
      "completions/mean_terminated_length": 230.5,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.3844281882047653,
      "epoch": 0.21051412691060675,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09906401485204697,
      "kl": 0.009228316484950483,
      "learning_rate": 9.579064381658175e-07,
      "loss": 0.0533,
      "num_tokens": 125005721.0,
      "reward": 0.23867067694664001,
      "reward_std": 0.42886680364608765,
      "rewards/reward_func/mean": 0.23867067694664001,
      "rewards/reward_func/std": 0.42886680364608765,
      "step": 4545,
      "step_time": 32.811641447246075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 150.0,
      "completions/mean_terminated_length": 150.0,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.1901368945837021,
      "epoch": 0.21056044465030108,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007037638686597347,
      "kl": 0.005093779705930501,
      "learning_rate": 9.578971746178787e-07,
      "loss": 0.0003,
      "num_tokens": 125027913.0,
      "reward": 0.3441537916660309,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3441537916660309,
      "rewards/reward_func/std": 0.0,
      "step": 4546,
      "step_time": 20.51408862695098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 190.375,
      "completions/mean_terminated_length": 190.375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.3855598568916321,
      "epoch": 0.21060676238999537,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007795747369527817,
      "kl": 0.006330879288725555,
      "learning_rate": 9.578879110699398e-07,
      "loss": 0.0003,
      "num_tokens": 125051711.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4547,
      "step_time": 22.817515335977077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 178.3125,
      "completions/mean_terminated_length": 178.3125,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.42228440940380096,
      "epoch": 0.21065308012968967,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006360540632158518,
      "kl": 0.004638633807189763,
      "learning_rate": 9.57878647522001e-07,
      "loss": 0.0002,
      "num_tokens": 125077860.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4548,
      "step_time": 22.673727177083492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 130.25,
      "completions/mean_terminated_length": 130.25,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3095518723130226,
      "epoch": 0.21069939786938396,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0042094760574400425,
      "kl": 0.002994225302245468,
      "learning_rate": 9.57869383974062e-07,
      "loss": 0.0001,
      "num_tokens": 125100072.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4549,
      "step_time": 16.402844041585922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 126.6875,
      "completions/mean_terminated_length": 126.6875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.29624032229185104,
      "epoch": 0.2107457156090783,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017073858762159944,
      "kl": 0.001342095754807815,
      "learning_rate": 9.578601204261232e-07,
      "loss": 0.0001,
      "num_tokens": 125121091.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4550,
      "step_time": 15.18632896989584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 143.1875,
      "completions/mean_terminated_length": 143.1875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.33111632615327835,
      "epoch": 0.21079203334877258,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001679004286415875,
      "kl": 0.0017345676897093654,
      "learning_rate": 9.578508568781843e-07,
      "loss": 0.0001,
      "num_tokens": 125143910.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4551,
      "step_time": 19.42225407063961
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 167.25,
      "completions/mean_terminated_length": 167.25,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.2126971296966076,
      "epoch": 0.21083835108846688,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10637318342924118,
      "kl": 0.005246924585662782,
      "learning_rate": 9.578415933302454e-07,
      "loss": 0.0236,
      "num_tokens": 125165690.0,
      "reward": 0.9966224431991577,
      "reward_std": 0.013510131277143955,
      "rewards/reward_func/mean": 0.9966224431991577,
      "rewards/reward_func/std": 0.013510138727724552,
      "step": 4552,
      "step_time": 18.213520001620054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 116.6875,
      "completions/mean_terminated_length": 116.6875,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.29982779920101166,
      "epoch": 0.21088466882816118,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036096482072025537,
      "kl": 0.00243820488685742,
      "learning_rate": 9.578323297823065e-07,
      "loss": 0.0001,
      "num_tokens": 125186293.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4553,
      "step_time": 16.36350916698575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 351.0,
      "completions/max_terminated_length": 351.0,
      "completions/mean_length": 295.0625,
      "completions/mean_terminated_length": 295.0625,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "entropy": 0.18046902120113373,
      "epoch": 0.2109309865678555,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0981452465057373,
      "kl": 0.021709760883823037,
      "learning_rate": 9.578230662343677e-07,
      "loss": -0.0837,
      "num_tokens": 125226966.0,
      "reward": 0.834690272808075,
      "reward_std": 0.1847260296344757,
      "rewards/reward_func/mean": 0.834690272808075,
      "rewards/reward_func/std": 0.1847260296344757,
      "step": 4554,
      "step_time": 36.709081299602985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 233.0,
      "completions/max_terminated_length": 233.0,
      "completions/mean_length": 200.5,
      "completions/mean_terminated_length": 200.5,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.2706534266471863,
      "epoch": 0.2109773043075498,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003649881575256586,
      "kl": 0.011807393282651901,
      "learning_rate": 9.578138026864288e-07,
      "loss": 0.0006,
      "num_tokens": 125261310.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4555,
      "step_time": 24.966287799179554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 138.75,
      "completions/mean_terminated_length": 138.75,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.26266178488731384,
      "epoch": 0.2110236220472441,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003660748712718487,
      "kl": 0.0018841018609236926,
      "learning_rate": 9.578045391384901e-07,
      "loss": 0.0001,
      "num_tokens": 125282730.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4556,
      "step_time": 17.017644729465246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 170.4375,
      "completions/mean_terminated_length": 170.4375,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.25271132960915565,
      "epoch": 0.2110699397869384,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14928926527500153,
      "kl": 0.021795076318085194,
      "learning_rate": 9.577952755905512e-07,
      "loss": 0.0029,
      "num_tokens": 125321665.0,
      "reward": 0.7160109281539917,
      "reward_std": 0.16633981466293335,
      "rewards/reward_func/mean": 0.7160109281539917,
      "rewards/reward_func/std": 0.16633982956409454,
      "step": 4557,
      "step_time": 24.311111871153116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 173.375,
      "completions/mean_terminated_length": 173.375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3092428520321846,
      "epoch": 0.2111162575266327,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0042096651159226894,
      "kl": 0.003150644537527114,
      "learning_rate": 9.577860120426122e-07,
      "loss": 0.0002,
      "num_tokens": 125344439.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4558,
      "step_time": 20.087486639618874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 130.5,
      "completions/mean_terminated_length": 130.5,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2757374122738838,
      "epoch": 0.211162575266327,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008420100435614586,
      "kl": 0.003156241960823536,
      "learning_rate": 9.577767484946733e-07,
      "loss": 0.0002,
      "num_tokens": 125364255.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4559,
      "step_time": 16.498020764440298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 250.8125,
      "completions/mean_terminated_length": 250.8125,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "entropy": 0.24240896478295326,
      "epoch": 0.2112088930060213,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07986567914485931,
      "kl": 0.00367005035514012,
      "learning_rate": 9.577674849467346e-07,
      "loss": 0.025,
      "num_tokens": 125397404.0,
      "reward": 0.9970567226409912,
      "reward_std": 0.008042472414672375,
      "rewards/reward_func/mean": 0.9970567226409912,
      "rewards/reward_func/std": 0.008042463101446629,
      "step": 4560,
      "step_time": 29.679964408278465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 183.6875,
      "completions/mean_terminated_length": 183.6875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.4318304732441902,
      "epoch": 0.2112552107457156,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.024634553119540215,
      "kl": 0.010949882445856929,
      "learning_rate": 9.577582213987957e-07,
      "loss": 0.0024,
      "num_tokens": 125419463.0,
      "reward": 1.0134880540135782e-05,
      "reward_std": 4.053952216054313e-05,
      "rewards/reward_func/mean": 1.0134880540135782e-05,
      "rewards/reward_func/std": 4.053952216054313e-05,
      "step": 4561,
      "step_time": 22.271343171596527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 169.0,
      "completions/mean_terminated_length": 169.0,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.3239048570394516,
      "epoch": 0.21130152848540992,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11310670524835587,
      "kl": 0.008481680473778397,
      "learning_rate": 9.577489578508569e-07,
      "loss": 0.1158,
      "num_tokens": 125449159.0,
      "reward": 0.8728713393211365,
      "reward_std": 0.23276567459106445,
      "rewards/reward_func/mean": 0.8728713393211365,
      "rewards/reward_func/std": 0.23276568949222565,
      "step": 4562,
      "step_time": 26.040478244423866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 125.3125,
      "completions/mean_terminated_length": 125.3125,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.3014897257089615,
      "epoch": 0.21134784622510422,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033178734593093395,
      "kl": 0.002205540135037154,
      "learning_rate": 9.57739694302918e-07,
      "loss": 0.0001,
      "num_tokens": 125471180.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4563,
      "step_time": 15.17513533681631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 215.75,
      "completions/mean_terminated_length": 215.75,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "entropy": 0.3299860283732414,
      "epoch": 0.21139416396479851,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1173308938741684,
      "kl": 0.005431530880741775,
      "learning_rate": 9.577304307549791e-07,
      "loss": 0.0216,
      "num_tokens": 125508904.0,
      "reward": 0.8586323261260986,
      "reward_std": 0.14511004090309143,
      "rewards/reward_func/mean": 0.8586323261260986,
      "rewards/reward_func/std": 0.14511004090309143,
      "step": 4564,
      "step_time": 29.23145243152976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 145.4375,
      "completions/mean_terminated_length": 145.4375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.29369380325078964,
      "epoch": 0.2114404817044928,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00887998379766941,
      "kl": 0.0034651908790692687,
      "learning_rate": 9.577211672070402e-07,
      "loss": 0.0002,
      "num_tokens": 125529839.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4565,
      "step_time": 18.286695763468742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 195.5625,
      "completions/mean_terminated_length": 195.5625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.4388611167669296,
      "epoch": 0.21148679944418713,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007069175597280264,
      "kl": 0.005334179208148271,
      "learning_rate": 9.577119036591014e-07,
      "loss": 0.0003,
      "num_tokens": 125555224.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4566,
      "step_time": 24.03802677616477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 174.75,
      "completions/mean_terminated_length": 174.75,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2579498067498207,
      "epoch": 0.21153311718388143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10790275037288666,
      "kl": 0.006988507346250117,
      "learning_rate": 9.577026401111625e-07,
      "loss": 0.0027,
      "num_tokens": 125576084.0,
      "reward": 0.75,
      "reward_std": 0.4472135901451111,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.44721361994743347,
      "step": 4567,
      "step_time": 20.670855939388275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 125.125,
      "completions/mean_terminated_length": 125.125,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.29111990332603455,
      "epoch": 0.21157943492357573,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004740697797387838,
      "kl": 0.002562293957453221,
      "learning_rate": 9.576933765632236e-07,
      "loss": 0.0001,
      "num_tokens": 125595654.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4568,
      "step_time": 16.81375139206648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 223.5625,
      "completions/mean_terminated_length": 223.5625,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "entropy": 0.19886551052331924,
      "epoch": 0.21162575266327002,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08791195601224899,
      "kl": 0.005757181206718087,
      "learning_rate": 9.57684113015285e-07,
      "loss": 0.0125,
      "num_tokens": 125621247.0,
      "reward": 0.05890907347202301,
      "reward_std": 0.012393509037792683,
      "rewards/reward_func/mean": 0.05890907347202301,
      "rewards/reward_func/std": 0.012393510900437832,
      "step": 4569,
      "step_time": 25.73760474100709
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 148.625,
      "completions/mean_terminated_length": 148.625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.2894188240170479,
      "epoch": 0.21167207040296435,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005633847322314978,
      "kl": 0.0026751953992061317,
      "learning_rate": 9.57674849467346e-07,
      "loss": 0.0001,
      "num_tokens": 125642233.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4570,
      "step_time": 21.330371465533972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 185.5625,
      "completions/mean_terminated_length": 185.5625,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.19079741090536118,
      "epoch": 0.21171838814265864,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007866875268518925,
      "kl": 0.005902003264054656,
      "learning_rate": 9.57665585919407e-07,
      "loss": 0.0003,
      "num_tokens": 125664850.0,
      "reward": 0.9181891679763794,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9181891679763794,
      "rewards/reward_func/std": 0.0,
      "step": 4571,
      "step_time": 20.46226677671075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 139.25,
      "completions/mean_terminated_length": 139.25,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.29646675288677216,
      "epoch": 0.21176470588235294,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030221971683204174,
      "kl": 0.002487851830665022,
      "learning_rate": 9.576563223714681e-07,
      "loss": 0.0001,
      "num_tokens": 125690518.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4572,
      "step_time": 17.496889680624008
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.0,
      "completions/max_terminated_length": 184.0,
      "completions/mean_length": 141.5,
      "completions/mean_terminated_length": 141.5,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.32203683257102966,
      "epoch": 0.21181102362204723,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003484180895611644,
      "kl": 0.0019459775066934526,
      "learning_rate": 9.576470588235294e-07,
      "loss": 0.0001,
      "num_tokens": 125711294.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4573,
      "step_time": 19.43079449236393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 130.75,
      "completions/mean_terminated_length": 130.75,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.2513924464583397,
      "epoch": 0.21185734136174156,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004076133482158184,
      "kl": 0.0025015312421601266,
      "learning_rate": 9.576377952755906e-07,
      "loss": 0.0001,
      "num_tokens": 125730970.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4574,
      "step_time": 15.779032353311777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 195.875,
      "completions/mean_terminated_length": 195.875,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.23622899502515793,
      "epoch": 0.21190365910143585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003442511660978198,
      "kl": 0.012230751803144813,
      "learning_rate": 9.576285317276517e-07,
      "loss": 0.0006,
      "num_tokens": 125763224.0,
      "reward": 0.3448947072029114,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3448947072029114,
      "rewards/reward_func/std": 0.0,
      "step": 4575,
      "step_time": 24.85686208307743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 188.9375,
      "completions/mean_terminated_length": 188.9375,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.2207338623702526,
      "epoch": 0.21194997684113015,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16800262033939362,
      "kl": 0.008518489601556212,
      "learning_rate": 9.576192681797128e-07,
      "loss": -0.0413,
      "num_tokens": 125786407.0,
      "reward": 0.6571069955825806,
      "reward_std": 0.1239239051938057,
      "rewards/reward_func/mean": 0.6571069955825806,
      "rewards/reward_func/std": 0.1239239051938057,
      "step": 4576,
      "step_time": 24.301697324961424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 181.0,
      "completions/mean_terminated_length": 181.0,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.2521881014108658,
      "epoch": 0.21199629458082445,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10636038333177567,
      "kl": 0.011740713787730783,
      "learning_rate": 9.57610004631774e-07,
      "loss": 0.0301,
      "num_tokens": 125808263.0,
      "reward": 0.973493218421936,
      "reward_std": 0.031041564419865608,
      "rewards/reward_func/mean": 0.973493218421936,
      "rewards/reward_func/std": 0.031041564419865608,
      "step": 4577,
      "step_time": 22.679786428809166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 224.875,
      "completions/mean_terminated_length": 224.875,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.2899409234523773,
      "epoch": 0.21204261232051877,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031824936158955097,
      "kl": 0.0034957891330122948,
      "learning_rate": 9.57600741083835e-07,
      "loss": 0.0002,
      "num_tokens": 125833605.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4578,
      "step_time": 30.98334626108408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 298.0,
      "completions/max_terminated_length": 298.0,
      "completions/mean_length": 217.5,
      "completions/mean_terminated_length": 217.5,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.28880978375673294,
      "epoch": 0.21208893006021307,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10978582501411438,
      "kl": 0.006959647638723254,
      "learning_rate": 9.575914775358962e-07,
      "loss": -0.0325,
      "num_tokens": 125859501.0,
      "reward": 0.46855640411376953,
      "reward_std": 0.48227450251579285,
      "rewards/reward_func/mean": 0.46855640411376953,
      "rewards/reward_func/std": 0.48227453231811523,
      "step": 4579,
      "step_time": 28.84536913409829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 133.8125,
      "completions/mean_terminated_length": 133.8125,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2924625501036644,
      "epoch": 0.21213524779990736,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031335516832768917,
      "kl": 0.0021028506162110716,
      "learning_rate": 9.575822139879573e-07,
      "loss": 0.0001,
      "num_tokens": 125880154.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4580,
      "step_time": 18.896043598651886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 124.1875,
      "completions/mean_terminated_length": 124.1875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.31091052293777466,
      "epoch": 0.21218156553960166,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004532175604254007,
      "kl": 0.002397934324108064,
      "learning_rate": 9.575729504400184e-07,
      "loss": 0.0001,
      "num_tokens": 125900573.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4581,
      "step_time": 16.21372254192829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 175.125,
      "completions/mean_terminated_length": 175.125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.18246766552329063,
      "epoch": 0.21222788327929598,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004709724336862564,
      "kl": 0.0029257500427775085,
      "learning_rate": 9.575636868920796e-07,
      "loss": 0.0001,
      "num_tokens": 125934751.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4582,
      "step_time": 22.58020205423236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 118.875,
      "completions/mean_terminated_length": 118.875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2522609978914261,
      "epoch": 0.21227420101899028,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001804339000955224,
      "kl": 0.0015569084498565644,
      "learning_rate": 9.575544233441407e-07,
      "loss": 0.0001,
      "num_tokens": 125954877.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4583,
      "step_time": 14.784600455313921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 182.9375,
      "completions/mean_terminated_length": 182.9375,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.27114830538630486,
      "epoch": 0.21232051875868457,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00425915839150548,
      "kl": 0.0043210614239797,
      "learning_rate": 9.575451597962018e-07,
      "loss": 0.0002,
      "num_tokens": 125981980.0,
      "reward": 0.9036020040512085,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9036020040512085,
      "rewards/reward_func/std": 0.0,
      "step": 4584,
      "step_time": 29.392589770257473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 166.75,
      "completions/mean_terminated_length": 166.75,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.23679695650935173,
      "epoch": 0.21236683649837887,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21038685739040375,
      "kl": 0.013930513057857752,
      "learning_rate": 9.57535896248263e-07,
      "loss": -0.0353,
      "num_tokens": 126018424.0,
      "reward": 0.82862389087677,
      "reward_std": 0.10218896716833115,
      "rewards/reward_func/mean": 0.82862389087677,
      "rewards/reward_func/std": 0.10218898206949234,
      "step": 4585,
      "step_time": 22.847710091620684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 133.3125,
      "completions/mean_terminated_length": 133.3125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.2652202919125557,
      "epoch": 0.2124131542380732,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024052548687905073,
      "kl": 0.0017215238767676055,
      "learning_rate": 9.575266327003243e-07,
      "loss": 0.0001,
      "num_tokens": 126040077.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4586,
      "step_time": 16.063202656805515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 647.0,
      "completions/max_terminated_length": 647.0,
      "completions/mean_length": 430.625,
      "completions/mean_terminated_length": 430.625,
      "completions/min_length": 252.0,
      "completions/min_terminated_length": 252.0,
      "entropy": 0.3443702310323715,
      "epoch": 0.2124594719777675,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06466368585824966,
      "kl": 0.009476348757743835,
      "learning_rate": 9.575173691523854e-07,
      "loss": -0.147,
      "num_tokens": 126073319.0,
      "reward": 0.17105086147785187,
      "reward_std": 0.14411544799804688,
      "rewards/reward_func/mean": 0.17105086147785187,
      "rewards/reward_func/std": 0.14411544799804688,
      "step": 4587,
      "step_time": 58.89421058818698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 193.4375,
      "completions/mean_terminated_length": 193.4375,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.1631155014038086,
      "epoch": 0.21250578971746178,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004590533673763275,
      "kl": 0.0763822328299284,
      "learning_rate": 9.575081056044465e-07,
      "loss": 0.0038,
      "num_tokens": 126102574.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4588,
      "step_time": 23.851110119372606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 177.5625,
      "completions/mean_terminated_length": 177.5625,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.428097628057003,
      "epoch": 0.21255210745715608,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0042982702143490314,
      "kl": 0.003557933960109949,
      "learning_rate": 9.574988420565074e-07,
      "loss": 0.0002,
      "num_tokens": 126148007.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4589,
      "step_time": 28.871601589024067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 122.5,
      "completions/mean_terminated_length": 122.5,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.3085201680660248,
      "epoch": 0.2125984251968504,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005929566919803619,
      "kl": 0.003112640930339694,
      "learning_rate": 9.574895785085688e-07,
      "loss": 0.0002,
      "num_tokens": 126170511.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4590,
      "step_time": 16.47818062081933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 208.4375,
      "completions/mean_terminated_length": 208.4375,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "entropy": 0.21477292105555534,
      "epoch": 0.2126447429365447,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004304870497435331,
      "kl": 0.003596308291889727,
      "learning_rate": 9.5748031496063e-07,
      "loss": 0.0002,
      "num_tokens": 126195174.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4591,
      "step_time": 23.432234924286604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 201.75,
      "completions/mean_terminated_length": 201.75,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.3422112464904785,
      "epoch": 0.212691060676239,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14413142204284668,
      "kl": 0.017712951870635152,
      "learning_rate": 9.57471051412691e-07,
      "loss": -0.0043,
      "num_tokens": 126222178.0,
      "reward": 0.0016898601315915585,
      "reward_std": 0.006743177305907011,
      "rewards/reward_func/mean": 0.0016898601315915585,
      "rewards/reward_func/std": 0.006743177771568298,
      "step": 4592,
      "step_time": 26.71819446235895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 160.875,
      "completions/mean_terminated_length": 160.875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.16070354357361794,
      "epoch": 0.2127373784159333,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.023666655644774437,
      "kl": 0.001973198464838788,
      "learning_rate": 9.574617878647522e-07,
      "loss": 0.0001,
      "num_tokens": 126245424.0,
      "reward": 0.9310627579689026,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9310627579689026,
      "rewards/reward_func/std": 0.0,
      "step": 4593,
      "step_time": 19.66918433830142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 183.9375,
      "completions/mean_terminated_length": 183.9375,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.3029061332345009,
      "epoch": 0.21278369615562762,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10588514804840088,
      "kl": 0.023241990245878696,
      "learning_rate": 9.574525243168133e-07,
      "loss": 0.0331,
      "num_tokens": 126268447.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 4594,
      "step_time": 22.20823608711362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 157.625,
      "completions/mean_terminated_length": 157.625,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2079700380563736,
      "epoch": 0.2128300138953219,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19576317071914673,
      "kl": 0.02368050836957991,
      "learning_rate": 9.574432607688744e-07,
      "loss": -0.2295,
      "num_tokens": 126290633.0,
      "reward": 0.3210454285144806,
      "reward_std": 0.3790600001811981,
      "rewards/reward_func/mean": 0.3210454285144806,
      "rewards/reward_func/std": 0.3790600001811981,
      "step": 4595,
      "step_time": 23.530725929886103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 141.125,
      "completions/mean_terminated_length": 141.125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.33004576712846756,
      "epoch": 0.2128763316350162,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00828706193715334,
      "kl": 0.004713651491329074,
      "learning_rate": 9.574339972209355e-07,
      "loss": 0.0002,
      "num_tokens": 126312571.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4596,
      "step_time": 19.159527648240328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.0,
      "completions/max_terminated_length": 134.0,
      "completions/mean_length": 115.875,
      "completions/mean_terminated_length": 115.875,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.23844431713223457,
      "epoch": 0.2129226493747105,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01080168504267931,
      "kl": 0.004030712181702256,
      "learning_rate": 9.574247336729967e-07,
      "loss": 0.0002,
      "num_tokens": 126331897.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4597,
      "step_time": 14.468946900218725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 170.5,
      "completions/mean_terminated_length": 170.5,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.1602429263293743,
      "epoch": 0.21296896711440483,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12220467627048492,
      "kl": 0.00193205161485821,
      "learning_rate": 9.574154701250578e-07,
      "loss": -0.0131,
      "num_tokens": 126369793.0,
      "reward": 0.8918383121490479,
      "reward_std": 0.0536632239818573,
      "rewards/reward_func/mean": 0.8918383121490479,
      "rewards/reward_func/std": 0.0536632314324379,
      "step": 4598,
      "step_time": 22.584174297749996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 154.3125,
      "completions/mean_terminated_length": 154.3125,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.4211675226688385,
      "epoch": 0.21301528485409912,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018207606626674533,
      "kl": 0.0016961352666839957,
      "learning_rate": 9.574062065771191e-07,
      "loss": 0.0001,
      "num_tokens": 126401718.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4599,
      "step_time": 20.368704564869404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 187.9375,
      "completions/mean_terminated_length": 187.9375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.2625105753540993,
      "epoch": 0.21306160259379342,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11870481818914413,
      "kl": 0.008358007995411754,
      "learning_rate": 9.573969430291802e-07,
      "loss": -0.0239,
      "num_tokens": 126424341.0,
      "reward": 0.9573257565498352,
      "reward_std": 0.01665830798447132,
      "rewards/reward_func/mean": 0.9573257565498352,
      "rewards/reward_func/std": 0.01665831357240677,
      "step": 4600,
      "step_time": 21.436405293643475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 185.9375,
      "completions/mean_terminated_length": 185.9375,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.2466711439192295,
      "epoch": 0.21310792033348772,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007192135322839022,
      "kl": 0.029116731602698565,
      "learning_rate": 9.573876794812412e-07,
      "loss": 0.0014,
      "num_tokens": 126453316.0,
      "reward": 0.9682132601737976,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9682132601737976,
      "rewards/reward_func/std": 0.0,
      "step": 4601,
      "step_time": 22.567537255585194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 165.4375,
      "completions/mean_terminated_length": 165.4375,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.25666791945695877,
      "epoch": 0.21315423807318204,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32522323727607727,
      "kl": 0.06802494544535875,
      "learning_rate": 9.573784159333023e-07,
      "loss": -0.0251,
      "num_tokens": 126474011.0,
      "reward": 0.8512284755706787,
      "reward_std": 0.06417026370763779,
      "rewards/reward_func/mean": 0.8512284755706787,
      "rewards/reward_func/std": 0.06417026370763779,
      "step": 4602,
      "step_time": 19.077897660434246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 182.875,
      "completions/mean_terminated_length": 182.875,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.49531012773513794,
      "epoch": 0.21320055581287634,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0043403939343988895,
      "kl": 0.003245527681428939,
      "learning_rate": 9.573691523853636e-07,
      "loss": 0.0002,
      "num_tokens": 126509193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4603,
      "step_time": 23.65907971560955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 318.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 292.25,
      "completions/mean_terminated_length": 292.25,
      "completions/min_length": 266.0,
      "completions/min_terminated_length": 266.0,
      "entropy": 0.21409835293889046,
      "epoch": 0.21324687355257063,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09618958830833435,
      "kl": 0.003828092012554407,
      "learning_rate": 9.573598888374247e-07,
      "loss": -0.0234,
      "num_tokens": 126537741.0,
      "reward": 0.9188828468322754,
      "reward_std": 0.08377746492624283,
      "rewards/reward_func/mean": 0.9188828468322754,
      "rewards/reward_func/std": 0.08377746492624283,
      "step": 4604,
      "step_time": 29.859783098101616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 135.125,
      "completions/mean_terminated_length": 135.125,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3226679340004921,
      "epoch": 0.21329319129226493,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025867170188575983,
      "kl": 0.002313228149432689,
      "learning_rate": 9.573506252894859e-07,
      "loss": 0.0001,
      "num_tokens": 126558447.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4605,
      "step_time": 16.92801634594798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 163.5,
      "completions/mean_terminated_length": 163.5,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.21732071042060852,
      "epoch": 0.21333950903195925,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003922209143638611,
      "kl": 0.004118205222766846,
      "learning_rate": 9.57341361741547e-07,
      "loss": 0.0002,
      "num_tokens": 126579351.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4606,
      "step_time": 18.701394371688366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 140.25,
      "completions/mean_terminated_length": 140.25,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.27046389877796173,
      "epoch": 0.21338582677165355,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003340133000165224,
      "kl": 0.002406741550657898,
      "learning_rate": 9.573320981936081e-07,
      "loss": 0.0001,
      "num_tokens": 126599611.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4607,
      "step_time": 17.629219640046358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 124.125,
      "completions/mean_terminated_length": 124.125,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "entropy": 0.3118182420730591,
      "epoch": 0.21343214451134784,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012819971889257431,
      "kl": 0.003876145579852164,
      "learning_rate": 9.573228346456692e-07,
      "loss": 0.0002,
      "num_tokens": 126622557.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4608,
      "step_time": 16.826832067221403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 176.25,
      "completions/mean_terminated_length": 176.25,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.32553625851869583,
      "epoch": 0.21347846225104214,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.018847228959202766,
      "kl": 0.01652515958994627,
      "learning_rate": 9.573135710977304e-07,
      "loss": 0.0008,
      "num_tokens": 126643489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4609,
      "step_time": 24.18261268734932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 312.0,
      "completions/max_terminated_length": 312.0,
      "completions/mean_length": 200.1875,
      "completions/mean_terminated_length": 200.1875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.40819302946329117,
      "epoch": 0.21352477999073646,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13739241659641266,
      "kl": 0.015501508256420493,
      "learning_rate": 9.573043075497915e-07,
      "loss": -0.0506,
      "num_tokens": 126667332.0,
      "reward": 0.0253753662109375,
      "reward_std": 0.054555393755435944,
      "rewards/reward_func/mean": 0.0253753662109375,
      "rewards/reward_func/std": 0.05455539748072624,
      "step": 4610,
      "step_time": 29.842195238918066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 157.625,
      "completions/mean_terminated_length": 157.625,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.4177410379052162,
      "epoch": 0.21357109773043076,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15692751109600067,
      "kl": 0.00395078951260075,
      "learning_rate": 9.572950440018526e-07,
      "loss": -0.135,
      "num_tokens": 126702350.0,
      "reward": 0.016619674861431122,
      "reward_std": 0.06647869944572449,
      "rewards/reward_func/mean": 0.016619674861431122,
      "rewards/reward_func/std": 0.06647869944572449,
      "step": 4611,
      "step_time": 27.532116916030645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 212.9375,
      "completions/mean_terminated_length": 212.9375,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.3895299881696701,
      "epoch": 0.21361741547012505,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12125103175640106,
      "kl": 0.005163967609405518,
      "learning_rate": 9.572857804539137e-07,
      "loss": 0.032,
      "num_tokens": 126725389.0,
      "reward": 0.06860418617725372,
      "reward_std": 0.23119214177131653,
      "rewards/reward_func/mean": 0.06860418617725372,
      "rewards/reward_func/std": 0.23119214177131653,
      "step": 4612,
      "step_time": 23.18581724539399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 114.625,
      "completions/mean_terminated_length": 114.625,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "entropy": 0.2978924736380577,
      "epoch": 0.21366373320981935,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004663385450839996,
      "kl": 0.0027721599326469004,
      "learning_rate": 9.57276516905975e-07,
      "loss": 0.0001,
      "num_tokens": 126746151.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4613,
      "step_time": 16.64656350389123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 182.875,
      "completions/mean_terminated_length": 182.875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.3374093174934387,
      "epoch": 0.21371005094951367,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0039483108557760715,
      "kl": 0.003512556490022689,
      "learning_rate": 9.57267253358036e-07,
      "loss": 0.0002,
      "num_tokens": 126771493.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4614,
      "step_time": 22.854474045336246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.0,
      "completions/max_terminated_length": 160.0,
      "completions/mean_length": 141.1875,
      "completions/mean_terminated_length": 141.1875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.3065594434738159,
      "epoch": 0.21375636868920797,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005497485864907503,
      "kl": 0.003522971353959292,
      "learning_rate": 9.572579898100971e-07,
      "loss": 0.0002,
      "num_tokens": 126795496.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4615,
      "step_time": 17.771773859858513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 132.4375,
      "completions/mean_terminated_length": 132.4375,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.3171450048685074,
      "epoch": 0.21380268642890227,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.020709557458758354,
      "kl": 0.005141422443557531,
      "learning_rate": 9.572487262621585e-07,
      "loss": 0.0003,
      "num_tokens": 126815903.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4616,
      "step_time": 16.291943036019802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 194.875,
      "completions/mean_terminated_length": 194.875,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.15820416063070297,
      "epoch": 0.21384900416859656,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011281813494861126,
      "kl": 0.053559258580207825,
      "learning_rate": 9.572394627142196e-07,
      "loss": 0.0027,
      "num_tokens": 126837197.0,
      "reward": 0.5706745982170105,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.5706745982170105,
      "rewards/reward_func/std": 0.0,
      "step": 4617,
      "step_time": 22.558176815509796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 139.25,
      "completions/mean_terminated_length": 139.25,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.3524722456932068,
      "epoch": 0.2138953219082909,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023935355711728334,
      "kl": 0.0020296122529543936,
      "learning_rate": 9.572301991662807e-07,
      "loss": 0.0001,
      "num_tokens": 126858001.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4618,
      "step_time": 16.64563300460577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 138.5,
      "completions/mean_terminated_length": 138.5,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.30146465450525284,
      "epoch": 0.21394163964798518,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034549932461231947,
      "kl": 0.00310306076426059,
      "learning_rate": 9.572209356183418e-07,
      "loss": 0.0002,
      "num_tokens": 126878137.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4619,
      "step_time": 17.105186745524406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 162.5,
      "completions/mean_terminated_length": 162.5,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.313643679022789,
      "epoch": 0.21398795738767948,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004588900599628687,
      "kl": 0.0025320181739516556,
      "learning_rate": 9.57211672070403e-07,
      "loss": 0.0001,
      "num_tokens": 126904289.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4620,
      "step_time": 19.462565012276173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 129.8125,
      "completions/mean_terminated_length": 129.8125,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.26016219705343246,
      "epoch": 0.21403427512737377,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006031656637787819,
      "kl": 0.0028295708762016147,
      "learning_rate": 9.57202408522464e-07,
      "loss": 0.0001,
      "num_tokens": 126928798.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4621,
      "step_time": 16.489441718906164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 137.625,
      "completions/mean_terminated_length": 137.625,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.2527787424623966,
      "epoch": 0.2140805928670681,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27769744396209717,
      "kl": 0.15016119182109833,
      "learning_rate": 9.571931449745252e-07,
      "loss": -0.0194,
      "num_tokens": 126951032.0,
      "reward": 0.9095566272735596,
      "reward_std": 0.24713833630084991,
      "rewards/reward_func/mean": 0.9095566272735596,
      "rewards/reward_func/std": 0.24713833630084991,
      "step": 4622,
      "step_time": 16.10928536951542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 288.0,
      "completions/max_terminated_length": 288.0,
      "completions/mean_length": 210.8125,
      "completions/mean_terminated_length": 210.8125,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.47990524023771286,
      "epoch": 0.2141269106067624,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11754059791564941,
      "kl": 0.010775170288980007,
      "learning_rate": 9.571838814265863e-07,
      "loss": 0.1185,
      "num_tokens": 126972773.0,
      "reward": 0.5,
      "reward_std": 0.5163977742195129,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5163977742195129,
      "step": 4623,
      "step_time": 28.969821341335773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 160.125,
      "completions/mean_terminated_length": 160.125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.45073506236076355,
      "epoch": 0.2141732283464567,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002356166485697031,
      "kl": 0.002657567209098488,
      "learning_rate": 9.571746178786475e-07,
      "loss": 0.0001,
      "num_tokens": 127023063.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4624,
      "step_time": 30.01600630953908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 172.8125,
      "completions/mean_terminated_length": 172.8125,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.36030296981334686,
      "epoch": 0.21421954608615099,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004975039511919022,
      "kl": 0.004276565974578261,
      "learning_rate": 9.571653543307086e-07,
      "loss": 0.0002,
      "num_tokens": 127045972.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4625,
      "step_time": 22.17604398727417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 160.125,
      "completions/mean_terminated_length": 160.125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.22017846629023552,
      "epoch": 0.2142658638258453,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13473278284072876,
      "kl": 0.004622761334758252,
      "learning_rate": 9.571560907827697e-07,
      "loss": 0.0022,
      "num_tokens": 127071798.0,
      "reward": 0.8826128244400024,
      "reward_std": 0.04670385643839836,
      "rewards/reward_func/mean": 0.8826128244400024,
      "rewards/reward_func/std": 0.04670385643839836,
      "step": 4626,
      "step_time": 20.95901571586728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 243.25,
      "completions/mean_terminated_length": 243.25,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "entropy": 0.21980761364102364,
      "epoch": 0.2143121815655396,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1065199077129364,
      "kl": 0.026685321470722556,
      "learning_rate": 9.571468272348308e-07,
      "loss": -0.0192,
      "num_tokens": 127101306.0,
      "reward": 0.45331358909606934,
      "reward_std": 0.08640240132808685,
      "rewards/reward_func/mean": 0.45331358909606934,
      "rewards/reward_func/std": 0.08640240877866745,
      "step": 4627,
      "step_time": 28.487680412828922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 153.8125,
      "completions/mean_terminated_length": 153.8125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.42449887096881866,
      "epoch": 0.2143584993052339,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018880803836509585,
      "kl": 0.002178079157602042,
      "learning_rate": 9.57137563686892e-07,
      "loss": 0.0001,
      "num_tokens": 127128359.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4628,
      "step_time": 18.94478588551283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 350.125,
      "completions/mean_terminated_length": 350.125,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "entropy": 0.17183669283986092,
      "epoch": 0.2144048170449282,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14275139570236206,
      "kl": 0.002390524576185271,
      "learning_rate": 9.57128300138953e-07,
      "loss": 0.0254,
      "num_tokens": 127160553.0,
      "reward": 0.9952442646026611,
      "reward_std": 0.012995131313800812,
      "rewards/reward_func/mean": 0.9952442646026611,
      "rewards/reward_func/std": 0.01299512293189764,
      "step": 4629,
      "step_time": 39.509769801050425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 167.9375,
      "completions/mean_terminated_length": 167.9375,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.4044266939163208,
      "epoch": 0.21445113478462252,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012995369732379913,
      "kl": 0.011043486651033163,
      "learning_rate": 9.571190365910144e-07,
      "loss": 0.0005,
      "num_tokens": 127181640.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4630,
      "step_time": 20.03908582776785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 141.75,
      "completions/mean_terminated_length": 141.75,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.12327790260314941,
      "epoch": 0.21449745252431682,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023728630039840937,
      "kl": 0.0017822127847466618,
      "learning_rate": 9.571097730430755e-07,
      "loss": 0.0001,
      "num_tokens": 127203524.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 4631,
      "step_time": 19.283959042280912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 188.4375,
      "completions/mean_terminated_length": 188.4375,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.26789214089512825,
      "epoch": 0.2145437702640111,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11501730978488922,
      "kl": 0.010447415814269334,
      "learning_rate": 9.571005094951365e-07,
      "loss": -0.0176,
      "num_tokens": 127224379.0,
      "reward": 0.9259670972824097,
      "reward_std": 0.0509251244366169,
      "rewards/reward_func/mean": 0.9259670972824097,
      "rewards/reward_func/std": 0.050925128161907196,
      "step": 4632,
      "step_time": 21.821676589548588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 346.0,
      "completions/max_terminated_length": 346.0,
      "completions/mean_length": 271.375,
      "completions/mean_terminated_length": 271.375,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.3288490027189255,
      "epoch": 0.2145900880037054,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07984766364097595,
      "kl": 0.014920821180567145,
      "learning_rate": 9.570912459471978e-07,
      "loss": -0.1445,
      "num_tokens": 127250673.0,
      "reward": 0.5475016236305237,
      "reward_std": 0.3812312185764313,
      "rewards/reward_func/mean": 0.5475016236305237,
      "rewards/reward_func/std": 0.3812311887741089,
      "step": 4633,
      "step_time": 32.65467547252774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 134.4375,
      "completions/mean_terminated_length": 134.4375,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.3110058158636093,
      "epoch": 0.21463640574339973,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003320001531392336,
      "kl": 0.0024003706639632583,
      "learning_rate": 9.57081982399259e-07,
      "loss": 0.0001,
      "num_tokens": 127272712.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4634,
      "step_time": 16.119121961295605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 153.1875,
      "completions/mean_terminated_length": 153.1875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.39740072190761566,
      "epoch": 0.21468272348309403,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002264913870021701,
      "kl": 0.0023553372593596578,
      "learning_rate": 9.5707271885132e-07,
      "loss": 0.0001,
      "num_tokens": 127306059.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4635,
      "step_time": 21.171541515737772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 167.375,
      "completions/mean_terminated_length": 167.375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.4089849814772606,
      "epoch": 0.21472904122278832,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030034270603209734,
      "kl": 0.002874799771234393,
      "learning_rate": 9.570634553033812e-07,
      "loss": 0.0001,
      "num_tokens": 127360193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4636,
      "step_time": 28.686766408383846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 140.125,
      "completions/mean_terminated_length": 140.125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.29447612166404724,
      "epoch": 0.21477535896248262,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00416575139388442,
      "kl": 0.002283772628288716,
      "learning_rate": 9.570541917554423e-07,
      "loss": 0.0001,
      "num_tokens": 127390147.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4637,
      "step_time": 20.987193293869495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 146.4375,
      "completions/mean_terminated_length": 146.4375,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.29804428666830063,
      "epoch": 0.21482167670217694,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004655946046113968,
      "kl": 0.003044213750399649,
      "learning_rate": 9.570449282075034e-07,
      "loss": 0.0002,
      "num_tokens": 127417242.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4638,
      "step_time": 19.60076381638646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 226.8125,
      "completions/mean_terminated_length": 226.8125,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "entropy": 0.32897651195526123,
      "epoch": 0.21486799444187124,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.9647154211997986,
      "kl": 0.021434172056615353,
      "learning_rate": 9.570356646595645e-07,
      "loss": -0.0003,
      "num_tokens": 127449847.0,
      "reward": 0.6860184073448181,
      "reward_std": 0.40906235575675964,
      "rewards/reward_func/mean": 0.6860184073448181,
      "rewards/reward_func/std": 0.40906238555908203,
      "step": 4639,
      "step_time": 26.903940606862307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 186.625,
      "completions/mean_terminated_length": 186.625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.40722519904375076,
      "epoch": 0.21491431218156554,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006639092694967985,
      "kl": 0.005273929215036333,
      "learning_rate": 9.570264011116257e-07,
      "loss": 0.0003,
      "num_tokens": 127474737.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4640,
      "step_time": 23.84171436727047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 174.3125,
      "completions/mean_terminated_length": 174.3125,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.18666213005781174,
      "epoch": 0.21496062992125983,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11597719043493271,
      "kl": 0.004080721584614366,
      "learning_rate": 9.570171375636868e-07,
      "loss": 0.0008,
      "num_tokens": 127497670.0,
      "reward": 0.7124775648117065,
      "reward_std": 0.07721612602472305,
      "rewards/reward_func/mean": 0.7124775648117065,
      "rewards/reward_func/std": 0.07721611857414246,
      "step": 4641,
      "step_time": 19.928462456911802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 183.9375,
      "completions/mean_terminated_length": 183.9375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.20664877444505692,
      "epoch": 0.21500694766095416,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10467500984668732,
      "kl": 0.016906562726944685,
      "learning_rate": 9.57007874015748e-07,
      "loss": -0.0046,
      "num_tokens": 127519061.0,
      "reward": 0.9768627882003784,
      "reward_std": 0.035443443804979324,
      "rewards/reward_func/mean": 0.9768627882003784,
      "rewards/reward_func/std": 0.03544343635439873,
      "step": 4642,
      "step_time": 21.659313656389713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 131.4375,
      "completions/mean_terminated_length": 131.4375,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.23793626576662064,
      "epoch": 0.21505326540064845,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002327620517462492,
      "kl": 0.0018894985259976238,
      "learning_rate": 9.569986104678092e-07,
      "loss": 0.0001,
      "num_tokens": 127541212.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4643,
      "step_time": 16.452261183410883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 170.0,
      "completions/mean_terminated_length": 170.0,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.37210172414779663,
      "epoch": 0.21509958314034275,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006618363782763481,
      "kl": 0.0028220415115356445,
      "learning_rate": 9.569893469198704e-07,
      "loss": 0.0001,
      "num_tokens": 127570380.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4644,
      "step_time": 21.53874461352825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 149.3125,
      "completions/mean_terminated_length": 149.3125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.308118037879467,
      "epoch": 0.21514590088003704,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026913676410913467,
      "kl": 0.002246640477096662,
      "learning_rate": 9.569800833719313e-07,
      "loss": 0.0001,
      "num_tokens": 127597985.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4645,
      "step_time": 18.83286164328456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 127.3125,
      "completions/mean_terminated_length": 127.3125,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2763524055480957,
      "epoch": 0.21519221861973137,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024202989879995584,
      "kl": 0.0019912614952772856,
      "learning_rate": 9.569708198239926e-07,
      "loss": 0.0001,
      "num_tokens": 127622374.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4646,
      "step_time": 17.542009364813566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 178.4375,
      "completions/mean_terminated_length": 178.4375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.42642128467559814,
      "epoch": 0.21523853635942566,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009753278456628323,
      "kl": 0.006714976276271045,
      "learning_rate": 9.569615562760537e-07,
      "loss": 0.0003,
      "num_tokens": 127651565.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4647,
      "step_time": 23.6161276884377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 174.5625,
      "completions/mean_terminated_length": 174.5625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.4136885926127434,
      "epoch": 0.21528485409911996,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028628241270780563,
      "kl": 0.0025501035270281136,
      "learning_rate": 9.569522927281149e-07,
      "loss": 0.0001,
      "num_tokens": 127694294.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4648,
      "step_time": 27.321494657546282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 310.0,
      "completions/max_terminated_length": 310.0,
      "completions/mean_length": 224.4375,
      "completions/mean_terminated_length": 224.4375,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.4577714130282402,
      "epoch": 0.21533117183881426,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012676808051764965,
      "kl": 0.011420159135013819,
      "learning_rate": 9.56943029180176e-07,
      "loss": 0.0006,
      "num_tokens": 127721149.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4649,
      "step_time": 33.25904209911823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 177.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 158.3125,
      "completions/mean_terminated_length": 158.3125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.14692023023962975,
      "epoch": 0.21537748957850858,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001845229766331613,
      "kl": 0.02969865733757615,
      "learning_rate": 9.569337656322371e-07,
      "loss": 0.0015,
      "num_tokens": 127742626.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4650,
      "step_time": 18.386997617781162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 125.9375,
      "completions/mean_terminated_length": 125.9375,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2768767401576042,
      "epoch": 0.21542380731820288,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006206747610121965,
      "kl": 0.002629841648740694,
      "learning_rate": 9.569245020842982e-07,
      "loss": 0.0001,
      "num_tokens": 127764305.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4651,
      "step_time": 17.05055009201169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 164.1875,
      "completions/mean_terminated_length": 164.1875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.34208963066339493,
      "epoch": 0.21547012505789717,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035113864578306675,
      "kl": 0.0023904606932774186,
      "learning_rate": 9.569152385363594e-07,
      "loss": 0.0001,
      "num_tokens": 127791476.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4652,
      "step_time": 21.758243769407272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 256.0,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "entropy": 0.24853120744228363,
      "epoch": 0.21551644279759147,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17795726656913757,
      "kl": 0.03794926730915904,
      "learning_rate": 9.569059749884205e-07,
      "loss": 0.0062,
      "num_tokens": 127827076.0,
      "reward": 0.9874951839447021,
      "reward_std": 0.01667311228811741,
      "rewards/reward_func/mean": 0.9874951839447021,
      "rewards/reward_func/std": 0.016673119738698006,
      "step": 4653,
      "step_time": 30.484981279820204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 213.375,
      "completions/mean_terminated_length": 213.375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.3825131356716156,
      "epoch": 0.2155627605372858,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009327673353254795,
      "kl": 0.008753576781600714,
      "learning_rate": 9.568967114404816e-07,
      "loss": 0.0004,
      "num_tokens": 127868458.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4654,
      "step_time": 28.964972458779812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 152.25,
      "completions/mean_terminated_length": 152.25,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.35802191495895386,
      "epoch": 0.2156090782769801,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005376492626965046,
      "kl": 0.0033036007080227137,
      "learning_rate": 9.568874478925427e-07,
      "loss": 0.0002,
      "num_tokens": 127899630.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4655,
      "step_time": 21.17877870053053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 361.0,
      "completions/max_terminated_length": 361.0,
      "completions/mean_length": 305.4375,
      "completions/mean_terminated_length": 305.4375,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "entropy": 0.26090652868151665,
      "epoch": 0.21565539601667438,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08595925569534302,
      "kl": 0.015238664811477065,
      "learning_rate": 9.56878184344604e-07,
      "loss": -0.1317,
      "num_tokens": 127931541.0,
      "reward": 0.8587601184844971,
      "reward_std": 0.3353709280490875,
      "rewards/reward_func/mean": 0.8587601184844971,
      "rewards/reward_func/std": 0.3353709578514099,
      "step": 4656,
      "step_time": 35.50366682559252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 284.0,
      "completions/max_terminated_length": 284.0,
      "completions/mean_length": 223.8125,
      "completions/mean_terminated_length": 223.8125,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "entropy": 0.362893283367157,
      "epoch": 0.21570171375636868,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003356839530169964,
      "kl": 0.003706058021634817,
      "learning_rate": 9.56868920796665e-07,
      "loss": 0.0002,
      "num_tokens": 127974210.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4657,
      "step_time": 32.763096299022436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 138.9375,
      "completions/mean_terminated_length": 138.9375,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.33154158294200897,
      "epoch": 0.215748031496063,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0057208905927836895,
      "kl": 0.0037041493924334645,
      "learning_rate": 9.568596572487261e-07,
      "loss": 0.0002,
      "num_tokens": 127996785.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4658,
      "step_time": 17.63434297591448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 119.0625,
      "completions/mean_terminated_length": 119.0625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.27619320899248123,
      "epoch": 0.2157943492357573,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006155842915177345,
      "kl": 0.005472356686368585,
      "learning_rate": 9.568503937007872e-07,
      "loss": 0.0003,
      "num_tokens": 128016338.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4659,
      "step_time": 16.408082224428654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 129.6875,
      "completions/mean_terminated_length": 129.6875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2596445754170418,
      "epoch": 0.2158406669754516,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004416171461343765,
      "kl": 0.002276734303450212,
      "learning_rate": 9.568411301528486e-07,
      "loss": 0.0001,
      "num_tokens": 128037053.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4660,
      "step_time": 17.764114674180746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 161.3125,
      "completions/mean_terminated_length": 161.3125,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.3849439322948456,
      "epoch": 0.2158869847151459,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018807444721460342,
      "kl": 0.001989488024264574,
      "learning_rate": 9.568318666049097e-07,
      "loss": 0.0001,
      "num_tokens": 128082946.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4661,
      "step_time": 27.11596456170082
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 133.25,
      "completions/mean_terminated_length": 133.25,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.2866143584251404,
      "epoch": 0.21593330245484021,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002465636469423771,
      "kl": 0.002389605826465413,
      "learning_rate": 9.568226030569708e-07,
      "loss": 0.0001,
      "num_tokens": 128104934.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4662,
      "step_time": 16.766618456691504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 147.625,
      "completions/mean_terminated_length": 147.625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 0.3677098676562309,
      "epoch": 0.2159796201945345,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005699924658983946,
      "kl": 0.003419053158722818,
      "learning_rate": 9.56813339509032e-07,
      "loss": 0.0002,
      "num_tokens": 128141856.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4663,
      "step_time": 21.845496512949467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 161.625,
      "completions/mean_terminated_length": 161.625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.38255808502435684,
      "epoch": 0.2160259379342288,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.021539025008678436,
      "kl": 0.012102874228730798,
      "learning_rate": 9.56804075961093e-07,
      "loss": 0.0006,
      "num_tokens": 128163946.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4664,
      "step_time": 20.500273644924164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 130.625,
      "completions/mean_terminated_length": 130.625,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.38598378747701645,
      "epoch": 0.2160722556739231,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003358134301379323,
      "kl": 0.002884236047975719,
      "learning_rate": 9.567948124131542e-07,
      "loss": 0.0001,
      "num_tokens": 128184692.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4665,
      "step_time": 16.254790641367435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 207.8125,
      "completions/mean_terminated_length": 207.8125,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.3718961030244827,
      "epoch": 0.21611857341361743,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1306808590888977,
      "kl": 0.007270547677762806,
      "learning_rate": 9.567855488652153e-07,
      "loss": 0.0207,
      "num_tokens": 128212289.0,
      "reward": 0.34135520458221436,
      "reward_std": 0.45313286781311035,
      "rewards/reward_func/mean": 0.34135520458221436,
      "rewards/reward_func/std": 0.45313286781311035,
      "step": 4666,
      "step_time": 29.924130588769913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 301.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 276.125,
      "completions/mean_terminated_length": 276.125,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "entropy": 0.17390716075897217,
      "epoch": 0.21616489115331172,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003358956892043352,
      "kl": 0.029748273082077503,
      "learning_rate": 9.567762853172765e-07,
      "loss": 0.0015,
      "num_tokens": 128237859.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 4667,
      "step_time": 28.623353756964207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 171.625,
      "completions/mean_terminated_length": 171.625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.1873151920735836,
      "epoch": 0.21621120889300602,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037431532982736826,
      "kl": 0.004099401994608343,
      "learning_rate": 9.567670217693376e-07,
      "loss": 0.0002,
      "num_tokens": 128260973.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4668,
      "step_time": 19.78599815070629
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 180.625,
      "completions/mean_terminated_length": 180.625,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.18856868147850037,
      "epoch": 0.21625752663270031,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004834700841456652,
      "kl": 0.004626446811016649,
      "learning_rate": 9.567577582213987e-07,
      "loss": 0.0002,
      "num_tokens": 128292263.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4669,
      "step_time": 24.34179699793458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 138.25,
      "completions/mean_terminated_length": 138.25,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.3532930091023445,
      "epoch": 0.21630384437239464,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026294998824596405,
      "kl": 0.0024022431462071836,
      "learning_rate": 9.567484946734598e-07,
      "loss": 0.0001,
      "num_tokens": 128312539.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4670,
      "step_time": 17.001453924924135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 152.875,
      "completions/mean_terminated_length": 152.875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.30836206674575806,
      "epoch": 0.21635016211208893,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14689210057258606,
      "kl": 0.004777474270667881,
      "learning_rate": 9.56739231125521e-07,
      "loss": 0.0891,
      "num_tokens": 128333065.0,
      "reward": 0.7721847891807556,
      "reward_std": 0.3014300763607025,
      "rewards/reward_func/mean": 0.7721847891807556,
      "rewards/reward_func/std": 0.3014300763607025,
      "step": 4671,
      "step_time": 20.48503626137972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 117.875,
      "completions/mean_terminated_length": 117.875,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2691037245094776,
      "epoch": 0.21639647985178323,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002530246740207076,
      "kl": 0.00207692536059767,
      "learning_rate": 9.56729967577582e-07,
      "loss": 0.0001,
      "num_tokens": 128353751.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4672,
      "step_time": 17.23697617277503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 162.8125,
      "completions/mean_terminated_length": 162.8125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.4040471464395523,
      "epoch": 0.21644279759147753,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004170588217675686,
      "kl": 0.0028006028151139617,
      "learning_rate": 9.567207040296434e-07,
      "loss": 0.0001,
      "num_tokens": 128391508.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4673,
      "step_time": 24.134961064904928
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 176.875,
      "completions/mean_terminated_length": 176.875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.41561467945575714,
      "epoch": 0.21648911533117185,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0052163079380989075,
      "kl": 0.005093224346637726,
      "learning_rate": 9.567114404817045e-07,
      "loss": 0.0003,
      "num_tokens": 128429346.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4674,
      "step_time": 24.104380950331688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 203.3125,
      "completions/mean_terminated_length": 203.3125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.45514459162950516,
      "epoch": 0.21653543307086615,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008697996847331524,
      "kl": 0.006443565711379051,
      "learning_rate": 9.567021769337655e-07,
      "loss": 0.0003,
      "num_tokens": 128451703.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4675,
      "step_time": 25.268934823572636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 140.4375,
      "completions/mean_terminated_length": 140.4375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.17368808761239052,
      "epoch": 0.21658175081056044,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2149912714958191,
      "kl": 0.006797351874411106,
      "learning_rate": 9.566929133858268e-07,
      "loss": -0.0114,
      "num_tokens": 128472350.0,
      "reward": 0.9025907516479492,
      "reward_std": 0.04832848533987999,
      "rewards/reward_func/mean": 0.9025907516479492,
      "rewards/reward_func/std": 0.048328500241041183,
      "step": 4676,
      "step_time": 16.623182900249958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 196.125,
      "completions/mean_terminated_length": 196.125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.21596404165029526,
      "epoch": 0.21662806855025474,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00794797483831644,
      "kl": 0.009435880696401,
      "learning_rate": 9.56683649837888e-07,
      "loss": 0.0005,
      "num_tokens": 128495216.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4677,
      "step_time": 23.499925259500742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 138.125,
      "completions/mean_terminated_length": 138.125,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.294400118291378,
      "epoch": 0.21667438628994906,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0044252327643334866,
      "kl": 0.002887769020162523,
      "learning_rate": 9.56674386289949e-07,
      "loss": 0.0001,
      "num_tokens": 128516962.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4678,
      "step_time": 16.759018633514643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 422.0,
      "completions/max_terminated_length": 422.0,
      "completions/mean_length": 392.6875,
      "completions/mean_terminated_length": 392.6875,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "entropy": 0.16387999802827835,
      "epoch": 0.21672070402964336,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.058388832956552505,
      "kl": 0.011111302766948938,
      "learning_rate": 9.566651227420102e-07,
      "loss": -0.0277,
      "num_tokens": 128553629.0,
      "reward": 0.9573274254798889,
      "reward_std": 0.05677105858922005,
      "rewards/reward_func/mean": 0.9573274254798889,
      "rewards/reward_func/std": 0.05677107349038124,
      "step": 4679,
      "step_time": 42.23031213134527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 457.0,
      "completions/max_terminated_length": 457.0,
      "completions/mean_length": 340.8125,
      "completions/mean_terminated_length": 340.8125,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "entropy": 0.23604802042245865,
      "epoch": 0.21676702176933765,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07029406726360321,
      "kl": 0.007450093748047948,
      "learning_rate": 9.566558591940713e-07,
      "loss": -0.0874,
      "num_tokens": 128587290.0,
      "reward": 0.8636986613273621,
      "reward_std": 0.33853599429130554,
      "rewards/reward_func/mean": 0.8636986613273621,
      "rewards/reward_func/std": 0.33853599429130554,
      "step": 4680,
      "step_time": 44.18964160978794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 379.0,
      "completions/max_terminated_length": 379.0,
      "completions/mean_length": 279.375,
      "completions/mean_terminated_length": 279.375,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "entropy": 0.39158811420202255,
      "epoch": 0.21681333950903195,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1147027537226677,
      "kl": 0.020142321474850178,
      "learning_rate": 9.566465956461324e-07,
      "loss": -0.1632,
      "num_tokens": 128624624.0,
      "reward": 0.5518031120300293,
      "reward_std": 0.5043283104896545,
      "rewards/reward_func/mean": 0.5518031120300293,
      "rewards/reward_func/std": 0.5043283104896545,
      "step": 4681,
      "step_time": 37.18413728475571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 124.375,
      "completions/mean_terminated_length": 124.375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3182864338159561,
      "epoch": 0.21685965724872627,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038417920004576445,
      "kl": 0.0027310732402838767,
      "learning_rate": 9.566373320981935e-07,
      "loss": 0.0001,
      "num_tokens": 128645590.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4682,
      "step_time": 15.642555423080921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 163.6875,
      "completions/mean_terminated_length": 163.6875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.21090726926922798,
      "epoch": 0.21690597498842057,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004298835527151823,
      "kl": 0.0026743889320641756,
      "learning_rate": 9.566280685502547e-07,
      "loss": 0.0001,
      "num_tokens": 128671601.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 4683,
      "step_time": 21.579754520207644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 206.625,
      "completions/mean_terminated_length": 206.625,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.21174703538417816,
      "epoch": 0.21695229272811486,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009199468418955803,
      "kl": 0.006716641131788492,
      "learning_rate": 9.566188050023158e-07,
      "loss": 0.0003,
      "num_tokens": 128696443.0,
      "reward": 0.7044320702552795,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7044320702552795,
      "rewards/reward_func/std": 0.0,
      "step": 4684,
      "step_time": 24.04336714744568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 112.9375,
      "completions/mean_terminated_length": 112.9375,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.27595193684101105,
      "epoch": 0.21699861046780916,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004088678862899542,
      "kl": 0.002328214410226792,
      "learning_rate": 9.56609541454377e-07,
      "loss": 0.0001,
      "num_tokens": 128716426.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4685,
      "step_time": 14.581633433699608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 123.875,
      "completions/mean_terminated_length": 123.875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2818344533443451,
      "epoch": 0.21704492820750348,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008714600466191769,
      "kl": 0.0047147831646725535,
      "learning_rate": 9.566002779064383e-07,
      "loss": 0.0002,
      "num_tokens": 128737080.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4686,
      "step_time": 15.111976612359285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 292.0,
      "completions/max_terminated_length": 292.0,
      "completions/mean_length": 241.375,
      "completions/mean_terminated_length": 241.375,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.22171951085329056,
      "epoch": 0.21709124594719778,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09821074455976486,
      "kl": 0.011160548659972847,
      "learning_rate": 9.565910143584994e-07,
      "loss": 0.0552,
      "num_tokens": 128764126.0,
      "reward": 0.9346067905426025,
      "reward_std": 0.04231547191739082,
      "rewards/reward_func/mean": 0.9346067905426025,
      "rewards/reward_func/std": 0.04231548309326172,
      "step": 4687,
      "step_time": 28.17325323075056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 152.0625,
      "completions/mean_terminated_length": 152.0625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.32022595405578613,
      "epoch": 0.21713756368689208,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004113008733838797,
      "kl": 0.0030579391168430448,
      "learning_rate": 9.565817508105603e-07,
      "loss": 0.0002,
      "num_tokens": 128785695.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4688,
      "step_time": 18.334290079772472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 145.5,
      "completions/mean_terminated_length": 145.5,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.41011738777160645,
      "epoch": 0.21718388142658637,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00345940375700593,
      "kl": 0.0037335798260755837,
      "learning_rate": 9.565724872626214e-07,
      "loss": 0.0002,
      "num_tokens": 128809175.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4689,
      "step_time": 20.805212043225765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 136.8125,
      "completions/mean_terminated_length": 136.8125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.24847476184368134,
      "epoch": 0.2172301991662807,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014227625913918018,
      "kl": 0.008940773201175034,
      "learning_rate": 9.565632237146828e-07,
      "loss": 0.0005,
      "num_tokens": 128832100.0,
      "reward": 0.0006008694763295352,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0006008694763295352,
      "rewards/reward_func/std": 0.0,
      "step": 4690,
      "step_time": 17.96142216026783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 209.0,
      "completions/mean_terminated_length": 209.0,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.15977605804800987,
      "epoch": 0.217276516905975,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0043493714183568954,
      "kl": 0.010925154201686382,
      "learning_rate": 9.565539601667439e-07,
      "loss": 0.0005,
      "num_tokens": 128854196.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4691,
      "step_time": 23.239343974739313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 292.0,
      "completions/max_terminated_length": 292.0,
      "completions/mean_length": 215.3125,
      "completions/mean_terminated_length": 215.3125,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.44224586337804794,
      "epoch": 0.2173228346456693,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11012236773967743,
      "kl": 0.005374442785978317,
      "learning_rate": 9.56544696618805e-07,
      "loss": 0.0092,
      "num_tokens": 128877033.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4692,
      "step_time": 27.61267638579011
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 155.5,
      "completions/mean_terminated_length": 155.5,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.3069450259208679,
      "epoch": 0.21736915238536358,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002614505821838975,
      "kl": 0.002206849807407707,
      "learning_rate": 9.565354330708661e-07,
      "loss": 0.0001,
      "num_tokens": 128908865.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4693,
      "step_time": 21.37566214054823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 176.875,
      "completions/mean_terminated_length": 176.875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.4276970997452736,
      "epoch": 0.2174154701250579,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021815637592226267,
      "kl": 0.002318186656339094,
      "learning_rate": 9.565261695229273e-07,
      "loss": 0.0001,
      "num_tokens": 128954271.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4694,
      "step_time": 26.854221165180206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 187.0,
      "completions/mean_terminated_length": 187.0,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.403320737183094,
      "epoch": 0.2174617878647522,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007326861843466759,
      "kl": 0.006120213191024959,
      "learning_rate": 9.565169059749884e-07,
      "loss": 0.0003,
      "num_tokens": 128979487.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4695,
      "step_time": 21.921999126672745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 124.0,
      "completions/max_terminated_length": 124.0,
      "completions/mean_length": 110.9375,
      "completions/mean_terminated_length": 110.9375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.27059856057167053,
      "epoch": 0.2175081056044465,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009745532646775246,
      "kl": 0.0035272304667159915,
      "learning_rate": 9.565076424270495e-07,
      "loss": 0.0002,
      "num_tokens": 128999790.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4696,
      "step_time": 13.533142525702715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 123.0625,
      "completions/mean_terminated_length": 123.0625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2352730892598629,
      "epoch": 0.2175544233441408,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002535650972276926,
      "kl": 0.0015813313657417893,
      "learning_rate": 9.564983788791106e-07,
      "loss": 0.0001,
      "num_tokens": 129019199.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4697,
      "step_time": 15.000992849469185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 195.6875,
      "completions/mean_terminated_length": 195.6875,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.22550852969288826,
      "epoch": 0.21760074108383512,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10005852580070496,
      "kl": 0.05339275160804391,
      "learning_rate": 9.564891153311718e-07,
      "loss": 0.0397,
      "num_tokens": 129040570.0,
      "reward": 0.7305104732513428,
      "reward_std": 0.12194602191448212,
      "rewards/reward_func/mean": 0.7305104732513428,
      "rewards/reward_func/std": 0.12194604426622391,
      "step": 4698,
      "step_time": 22.246045541018248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 207.0625,
      "completions/mean_terminated_length": 207.0625,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.4338148683309555,
      "epoch": 0.21764705882352942,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00788185466080904,
      "kl": 0.005995885643642396,
      "learning_rate": 9.564798517832329e-07,
      "loss": 0.0003,
      "num_tokens": 129072107.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4699,
      "step_time": 27.115179523825645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 166.75,
      "completions/mean_terminated_length": 166.75,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.16094813868403435,
      "epoch": 0.2176933765632237,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12027033418416977,
      "kl": 0.003907462581992149,
      "learning_rate": 9.56470588235294e-07,
      "loss": 0.0094,
      "num_tokens": 129102935.0,
      "reward": 0.9616204500198364,
      "reward_std": 0.06865544617176056,
      "rewards/reward_func/mean": 0.9616204500198364,
      "rewards/reward_func/std": 0.06865545362234116,
      "step": 4700,
      "step_time": 22.509795740246773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 170.375,
      "completions/mean_terminated_length": 170.375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.4316684827208519,
      "epoch": 0.217739694302918,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002886262722313404,
      "kl": 0.002785950870020315,
      "learning_rate": 9.564613246873551e-07,
      "loss": 0.0001,
      "num_tokens": 129137389.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4701,
      "step_time": 22.80204911902547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 124.0,
      "completions/max_terminated_length": 124.0,
      "completions/mean_length": 110.6875,
      "completions/mean_terminated_length": 110.6875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.264005608856678,
      "epoch": 0.21778601204261233,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0042200335301458836,
      "kl": 0.0020503849955275655,
      "learning_rate": 9.564520611394163e-07,
      "loss": 0.0001,
      "num_tokens": 129159000.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4702,
      "step_time": 14.584453262388706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 205.875,
      "completions/mean_terminated_length": 205.875,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.19852978736162186,
      "epoch": 0.21783232978230663,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09294604510068893,
      "kl": 0.012052624253556132,
      "learning_rate": 9.564427975914776e-07,
      "loss": -0.0466,
      "num_tokens": 129181542.0,
      "reward": 0.9442729949951172,
      "reward_std": 0.22290799021720886,
      "rewards/reward_func/mean": 0.9442729949951172,
      "rewards/reward_func/std": 0.22290800511837006,
      "step": 4703,
      "step_time": 24.002835299819708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 112.0,
      "completions/mean_terminated_length": 112.0,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.28455759584903717,
      "epoch": 0.21787864752200092,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004972612485289574,
      "kl": 0.0028877685545012355,
      "learning_rate": 9.564335340435387e-07,
      "loss": 0.0001,
      "num_tokens": 129202326.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4704,
      "step_time": 16.196747165173292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 160.0625,
      "completions/mean_terminated_length": 160.0625,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.38754066824913025,
      "epoch": 0.21792496526169522,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002032277174293995,
      "kl": 0.0025206418649759144,
      "learning_rate": 9.564242704955998e-07,
      "loss": 0.0001,
      "num_tokens": 129252519.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4705,
      "step_time": 25.311698351055384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 150.0,
      "completions/max_terminated_length": 150.0,
      "completions/mean_length": 123.0625,
      "completions/mean_terminated_length": 123.0625,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.30276161432266235,
      "epoch": 0.21797128300138954,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004128539469093084,
      "kl": 0.0028701364062726498,
      "learning_rate": 9.56415006947661e-07,
      "loss": 0.0001,
      "num_tokens": 129276728.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4706,
      "step_time": 16.46551175415516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 166.9375,
      "completions/mean_terminated_length": 166.9375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.41504184901714325,
      "epoch": 0.21801760074108384,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027445501182228327,
      "kl": 0.002175698842620477,
      "learning_rate": 9.56405743399722e-07,
      "loss": 0.0001,
      "num_tokens": 129328999.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4707,
      "step_time": 27.024520061910152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 203.625,
      "completions/mean_terminated_length": 203.625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.4600290209054947,
      "epoch": 0.21806391848077814,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006384314503520727,
      "kl": 0.008535626577213407,
      "learning_rate": 9.563964798517832e-07,
      "loss": 0.0004,
      "num_tokens": 129362049.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4708,
      "step_time": 28.051227170974016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 133.1875,
      "completions/mean_terminated_length": 133.1875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.3612709268927574,
      "epoch": 0.21811023622047243,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002812664955854416,
      "kl": 0.0027132914983667433,
      "learning_rate": 9.563872163038443e-07,
      "loss": 0.0001,
      "num_tokens": 129384420.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4709,
      "step_time": 16.85134883597493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 205.9375,
      "completions/mean_terminated_length": 205.9375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.18895790353417397,
      "epoch": 0.21815655396016675,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0013839430175721645,
      "kl": 0.002374694449827075,
      "learning_rate": 9.563779527559055e-07,
      "loss": 0.0001,
      "num_tokens": 129407699.0,
      "reward": 0.8668779134750366,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8668779134750366,
      "rewards/reward_func/std": 0.0,
      "step": 4710,
      "step_time": 23.17830441892147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 155.0,
      "completions/max_terminated_length": 155.0,
      "completions/mean_length": 134.375,
      "completions/mean_terminated_length": 134.375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2579244151711464,
      "epoch": 0.21820287169986105,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005870082415640354,
      "kl": 0.0031024435884319246,
      "learning_rate": 9.563686892079666e-07,
      "loss": 0.0002,
      "num_tokens": 129427433.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4711,
      "step_time": 16.59689372777939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 115.8125,
      "completions/mean_terminated_length": 115.8125,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.28431785851716995,
      "epoch": 0.21824918943955535,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003787873312830925,
      "kl": 0.0024733886239118874,
      "learning_rate": 9.563594256600277e-07,
      "loss": 0.0001,
      "num_tokens": 129447542.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4712,
      "step_time": 14.652054857462645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 247.0,
      "completions/max_terminated_length": 247.0,
      "completions/mean_length": 198.0,
      "completions/mean_terminated_length": 198.0,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.40155162662267685,
      "epoch": 0.21829550717924964,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16008706390857697,
      "kl": 0.00958009366877377,
      "learning_rate": 9.563501621120888e-07,
      "loss": 0.0182,
      "num_tokens": 129471782.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4713,
      "step_time": 25.589435674250126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 171.6875,
      "completions/mean_terminated_length": 171.6875,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.39728278666734695,
      "epoch": 0.21834182491894397,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005508288275450468,
      "kl": 0.004464473749976605,
      "learning_rate": 9.5634089856415e-07,
      "loss": 0.0002,
      "num_tokens": 129500737.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4714,
      "step_time": 22.905235670506954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 223.0,
      "completions/max_terminated_length": 223.0,
      "completions/mean_length": 184.5625,
      "completions/mean_terminated_length": 184.5625,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.34112362563610077,
      "epoch": 0.21838814265863826,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12135380506515503,
      "kl": 0.016973228193819523,
      "learning_rate": 9.56331635016211e-07,
      "loss": -0.0496,
      "num_tokens": 129523018.0,
      "reward": 0.7375502586364746,
      "reward_std": 0.3552190959453583,
      "rewards/reward_func/mean": 0.7375502586364746,
      "rewards/reward_func/std": 0.3552190959453583,
      "step": 4715,
      "step_time": 22.028970792889595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 131.1875,
      "completions/mean_terminated_length": 131.1875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.31367379426956177,
      "epoch": 0.21843446039833256,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005986443720757961,
      "kl": 0.0048665719805285335,
      "learning_rate": 9.563223714682724e-07,
      "loss": 0.0002,
      "num_tokens": 129543005.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4716,
      "step_time": 15.876225415617228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 328.0,
      "completions/max_terminated_length": 328.0,
      "completions/mean_length": 232.8125,
      "completions/mean_terminated_length": 232.8125,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.5032502710819244,
      "epoch": 0.21848077813802685,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13312603533267975,
      "kl": 0.007286880747415125,
      "learning_rate": 9.563131079203336e-07,
      "loss": 0.076,
      "num_tokens": 129566202.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 4717,
      "step_time": 30.825655966997147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 136.375,
      "completions/mean_terminated_length": 136.375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.2676912546157837,
      "epoch": 0.21852709587772118,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0040293484926223755,
      "kl": 0.0029119584360159934,
      "learning_rate": 9.563038443723945e-07,
      "loss": 0.0001,
      "num_tokens": 129588176.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4718,
      "step_time": 16.30700771510601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 160.8125,
      "completions/mean_terminated_length": 160.8125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.15503466874361038,
      "epoch": 0.21857341361741547,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002281376626342535,
      "kl": 0.0025667650625109673,
      "learning_rate": 9.562945808244556e-07,
      "loss": 0.0001,
      "num_tokens": 129612781.0,
      "reward": 0.1353352814912796,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.1353352814912796,
      "rewards/reward_func/std": 0.0,
      "step": 4719,
      "step_time": 18.35327873378992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 129.0,
      "completions/max_terminated_length": 129.0,
      "completions/mean_length": 107.875,
      "completions/mean_terminated_length": 107.875,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 0.24697128683328629,
      "epoch": 0.21861973135710977,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030700985807925463,
      "kl": 0.002165102807339281,
      "learning_rate": 9.56285317276517e-07,
      "loss": 0.0001,
      "num_tokens": 129632123.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4720,
      "step_time": 13.440684881061316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 162.6875,
      "completions/mean_terminated_length": 162.6875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.38421468436717987,
      "epoch": 0.21866604909680407,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033809831365942955,
      "kl": 0.002879140607547015,
      "learning_rate": 9.56276053728578e-07,
      "loss": 0.0001,
      "num_tokens": 129692486.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4721,
      "step_time": 30.075780019164085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 137.0,
      "completions/max_terminated_length": 137.0,
      "completions/mean_length": 117.375,
      "completions/mean_terminated_length": 117.375,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.26094746217131615,
      "epoch": 0.2187123668364984,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005386655684560537,
      "kl": 0.003322462609503418,
      "learning_rate": 9.562667901806392e-07,
      "loss": 0.0002,
      "num_tokens": 129714252.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4722,
      "step_time": 15.187982801347971
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 255.0,
      "completions/max_terminated_length": 255.0,
      "completions/mean_length": 231.5,
      "completions/mean_terminated_length": 231.5,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "entropy": 0.2107616625726223,
      "epoch": 0.21875868457619269,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0058716400526463985,
      "kl": 0.00631087610963732,
      "learning_rate": 9.562575266327003e-07,
      "loss": 0.0003,
      "num_tokens": 129752628.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4723,
      "step_time": 32.07692110911012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 144.3125,
      "completions/mean_terminated_length": 144.3125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.28818415850400925,
      "epoch": 0.21880500231588698,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030706559773534536,
      "kl": 0.0022004097991157323,
      "learning_rate": 9.562482630847614e-07,
      "loss": 0.0001,
      "num_tokens": 129773161.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4724,
      "step_time": 17.998664502054453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 163.3125,
      "completions/mean_terminated_length": 163.3125,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.4172137379646301,
      "epoch": 0.21885132005558128,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025425860658288,
      "kl": 0.0025123218656517565,
      "learning_rate": 9.562389995368225e-07,
      "loss": 0.0001,
      "num_tokens": 129824510.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4725,
      "step_time": 27.130378130823374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 120.1875,
      "completions/mean_terminated_length": 120.1875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2462136559188366,
      "epoch": 0.2188976377952756,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003374190768226981,
      "kl": 0.0016968742129392922,
      "learning_rate": 9.562297359888837e-07,
      "loss": 0.0001,
      "num_tokens": 129844497.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4726,
      "step_time": 16.39966733008623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 160.5625,
      "completions/mean_terminated_length": 160.5625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.20630601793527603,
      "epoch": 0.2189439555349699,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005442303605377674,
      "kl": 0.003976957174018025,
      "learning_rate": 9.562204724409448e-07,
      "loss": 0.0002,
      "num_tokens": 129865194.0,
      "reward": 0.904837429523468,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.904837429523468,
      "rewards/reward_func/std": 0.0,
      "step": 4727,
      "step_time": 19.599647972732782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 132.0,
      "completions/max_terminated_length": 132.0,
      "completions/mean_length": 127.875,
      "completions/mean_terminated_length": 127.875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.1109622921794653,
      "epoch": 0.2189902732746642,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014203027822077274,
      "kl": 0.0011672762630041689,
      "learning_rate": 9.56211208893006e-07,
      "loss": 0.0001,
      "num_tokens": 129887160.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 4728,
      "step_time": 15.035693380981684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 123.375,
      "completions/mean_terminated_length": 123.375,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.26559437066316605,
      "epoch": 0.2190365910143585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002877059392631054,
      "kl": 0.0023826081887818873,
      "learning_rate": 9.56201945345067e-07,
      "loss": 0.0001,
      "num_tokens": 129910318.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4729,
      "step_time": 15.939006235450506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 156.625,
      "completions/mean_terminated_length": 156.625,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.24844319373369217,
      "epoch": 0.2190829087540528,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.022856680676341057,
      "kl": 0.014928898774087429,
      "learning_rate": 9.561926817971284e-07,
      "loss": 0.0007,
      "num_tokens": 129947080.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4730,
      "step_time": 25.66942211985588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 164.625,
      "completions/mean_terminated_length": 164.625,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.35775817930698395,
      "epoch": 0.2191292264937471,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023563746362924576,
      "kl": 0.002303392451722175,
      "learning_rate": 9.561834182491893e-07,
      "loss": 0.0001,
      "num_tokens": 129977858.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4731,
      "step_time": 20.681682847440243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 125.3125,
      "completions/mean_terminated_length": 125.3125,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.23523488640785217,
      "epoch": 0.2191755442334414,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005492073483765125,
      "kl": 0.003430097654927522,
      "learning_rate": 9.561741547012504e-07,
      "loss": 0.0002,
      "num_tokens": 129997815.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4732,
      "step_time": 15.587875299155712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 183.9375,
      "completions/mean_terminated_length": 183.9375,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.2596520856022835,
      "epoch": 0.2192218619731357,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001385444076731801,
      "kl": 0.0012672853044932708,
      "learning_rate": 9.561648911533118e-07,
      "loss": 0.0001,
      "num_tokens": 130052310.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4733,
      "step_time": 28.772629994899035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 257.0,
      "completions/max_terminated_length": 257.0,
      "completions/mean_length": 223.1875,
      "completions/mean_terminated_length": 223.1875,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "entropy": 0.256424181163311,
      "epoch": 0.21926817971283002,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12284564971923828,
      "kl": 0.011806728085502982,
      "learning_rate": 9.561556276053729e-07,
      "loss": 0.0019,
      "num_tokens": 130075881.0,
      "reward": 0.9938837289810181,
      "reward_std": 0.016712799668312073,
      "rewards/reward_func/mean": 0.9938837289810181,
      "rewards/reward_func/std": 0.016712794080376625,
      "step": 4734,
      "step_time": 24.939057677984238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 191.25,
      "completions/mean_terminated_length": 191.25,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.23606186732649803,
      "epoch": 0.21931449745252432,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11030472069978714,
      "kl": 0.00840791326481849,
      "learning_rate": 9.56146364057434e-07,
      "loss": -0.0696,
      "num_tokens": 130103517.0,
      "reward": 0.9772799015045166,
      "reward_std": 0.03029346466064453,
      "rewards/reward_func/mean": 0.9772799015045166,
      "rewards/reward_func/std": 0.03029346466064453,
      "step": 4735,
      "step_time": 23.88426313176751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 143.0,
      "completions/max_terminated_length": 143.0,
      "completions/mean_length": 128.375,
      "completions/mean_terminated_length": 128.375,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.13496162928640842,
      "epoch": 0.21936081519221862,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005121160298585892,
      "kl": 0.003617467125877738,
      "learning_rate": 9.561371005094951e-07,
      "loss": 0.0002,
      "num_tokens": 130128099.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4736,
      "step_time": 16.25461396574974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 155.4375,
      "completions/mean_terminated_length": 155.4375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3519606217741966,
      "epoch": 0.2194071329319129,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004885755944997072,
      "kl": 0.0037717887898907065,
      "learning_rate": 9.561278369615563e-07,
      "loss": 0.0002,
      "num_tokens": 130156138.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4737,
      "step_time": 20.14709546044469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 198.0,
      "completions/mean_terminated_length": 198.0,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4937119781970978,
      "epoch": 0.21945345067160724,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007549159228801727,
      "kl": 0.0056273117661476135,
      "learning_rate": 9.561185734136174e-07,
      "loss": 0.0003,
      "num_tokens": 130183738.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4738,
      "step_time": 24.763157915323973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 180.9375,
      "completions/mean_terminated_length": 180.9375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.4217841774225235,
      "epoch": 0.21949976841130153,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002368347719311714,
      "kl": 0.002506342134438455,
      "learning_rate": 9.561093098656785e-07,
      "loss": 0.0001,
      "num_tokens": 130224489.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4739,
      "step_time": 26.37431424111128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 178.0625,
      "completions/mean_terminated_length": 178.0625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.361618809401989,
      "epoch": 0.21954608615099583,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009282790124416351,
      "kl": 0.008149751694872975,
      "learning_rate": 9.561000463177396e-07,
      "loss": 0.0004,
      "num_tokens": 130249818.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4740,
      "step_time": 25.666003689169884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 133.0625,
      "completions/mean_terminated_length": 133.0625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.23081785067915916,
      "epoch": 0.21959240389069012,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004358035046607256,
      "kl": 0.0024257359909825027,
      "learning_rate": 9.560907827698008e-07,
      "loss": 0.0001,
      "num_tokens": 130269371.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4741,
      "step_time": 17.47803793102503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 124.4375,
      "completions/mean_terminated_length": 124.4375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.2911137491464615,
      "epoch": 0.21963872163038445,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01487666554749012,
      "kl": 0.006362575339153409,
      "learning_rate": 9.560815192218619e-07,
      "loss": 0.0003,
      "num_tokens": 130292002.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4742,
      "step_time": 17.08535874634981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 161.375,
      "completions/mean_terminated_length": 161.375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.38935375958681107,
      "epoch": 0.21968503937007874,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002340888138860464,
      "kl": 0.0022539437632076442,
      "learning_rate": 9.56072255673923e-07,
      "loss": 0.0001,
      "num_tokens": 130342776.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4743,
      "step_time": 27.16944271698594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 145.5,
      "completions/mean_terminated_length": 145.5,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.327069990336895,
      "epoch": 0.21973135710977304,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003524728585034609,
      "kl": 0.0026839629281312227,
      "learning_rate": 9.560629921259841e-07,
      "loss": 0.0001,
      "num_tokens": 130362880.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4744,
      "step_time": 16.383930154144764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 149.625,
      "completions/mean_terminated_length": 149.625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.39243703335523605,
      "epoch": 0.21977767484946734,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003960326313972473,
      "kl": 0.0033166881185024977,
      "learning_rate": 9.560537285780453e-07,
      "loss": 0.0002,
      "num_tokens": 130391338.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4745,
      "step_time": 20.206228274852037
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 301.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 205.125,
      "completions/mean_terminated_length": 205.125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.4340764209628105,
      "epoch": 0.21982399258916166,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10383335500955582,
      "kl": 0.011215093079954386,
      "learning_rate": 9.560444650301066e-07,
      "loss": -0.1166,
      "num_tokens": 130419948.0,
      "reward": 0.4018140435218811,
      "reward_std": 0.4711059331893921,
      "rewards/reward_func/mean": 0.4018140435218811,
      "rewards/reward_func/std": 0.47110599279403687,
      "step": 4746,
      "step_time": 29.4142703153193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 124.0,
      "completions/max_terminated_length": 124.0,
      "completions/mean_length": 108.3125,
      "completions/mean_terminated_length": 108.3125,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "entropy": 0.2552882730960846,
      "epoch": 0.21987031032885596,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025769576895982027,
      "kl": 0.0020312996639404446,
      "learning_rate": 9.560352014821677e-07,
      "loss": 0.0001,
      "num_tokens": 130440225.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4747,
      "step_time": 13.208584818989038
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 180.0625,
      "completions/mean_terminated_length": 180.0625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.3985041454434395,
      "epoch": 0.21991662806855025,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022527764085680246,
      "kl": 0.0023291026591323316,
      "learning_rate": 9.560259379342288e-07,
      "loss": 0.0001,
      "num_tokens": 130489458.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4748,
      "step_time": 24.394183471798897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 136.3125,
      "completions/mean_terminated_length": 136.3125,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 0.27034758776426315,
      "epoch": 0.21996294580824455,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005365348886698484,
      "kl": 0.003320941119454801,
      "learning_rate": 9.560166743862898e-07,
      "loss": 0.0002,
      "num_tokens": 130525575.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4749,
      "step_time": 18.391599718481302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 125.4375,
      "completions/mean_terminated_length": 125.4375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.33369433879852295,
      "epoch": 0.22000926354793887,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0039688884280622005,
      "kl": 0.0027712912997230887,
      "learning_rate": 9.56007410838351e-07,
      "loss": 0.0001,
      "num_tokens": 130552526.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4750,
      "step_time": 15.59014567732811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 146.9375,
      "completions/mean_terminated_length": 146.9375,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.28914664685726166,
      "epoch": 0.22005558128763317,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00288494979031384,
      "kl": 0.0020578180556185544,
      "learning_rate": 9.559981472904122e-07,
      "loss": 0.0001,
      "num_tokens": 130588653.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4751,
      "step_time": 20.300949413329363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 352.0,
      "completions/max_terminated_length": 352.0,
      "completions/mean_length": 244.0625,
      "completions/mean_terminated_length": 244.0625,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.31658507138490677,
      "epoch": 0.22010189902732746,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0900760367512703,
      "kl": 0.02131354482844472,
      "learning_rate": 9.559888837424733e-07,
      "loss": -0.1653,
      "num_tokens": 130626654.0,
      "reward": 0.5,
      "reward_std": 0.5163977742195129,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5163977742195129,
      "step": 4752,
      "step_time": 32.26984718814492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 186.0625,
      "completions/mean_terminated_length": 186.0625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3751211538910866,
      "epoch": 0.22014821676702176,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029134678188711405,
      "kl": 0.0027626954251900315,
      "learning_rate": 9.559796201945345e-07,
      "loss": 0.0001,
      "num_tokens": 130680063.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4753,
      "step_time": 26.41571592912078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 226.875,
      "completions/mean_terminated_length": 226.875,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "entropy": 0.3934357687830925,
      "epoch": 0.22019453450671608,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09501536935567856,
      "kl": 0.016132954973727465,
      "learning_rate": 9.559703566465956e-07,
      "loss": 0.0564,
      "num_tokens": 130704269.0,
      "reward": 0.5964052677154541,
      "reward_std": 0.43766361474990845,
      "rewards/reward_func/mean": 0.5964052677154541,
      "rewards/reward_func/std": 0.43766364455223083,
      "step": 4754,
      "step_time": 25.017730347812176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 126.125,
      "completions/mean_terminated_length": 126.125,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3438038304448128,
      "epoch": 0.22024085224641038,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003388928947970271,
      "kl": 0.002402106241788715,
      "learning_rate": 9.559610930986567e-07,
      "loss": 0.0001,
      "num_tokens": 130725583.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4755,
      "step_time": 13.71153750270605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 190.9375,
      "completions/mean_terminated_length": 190.9375,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.2335614301264286,
      "epoch": 0.22028716998610468,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005704137030988932,
      "kl": 0.03125150501728058,
      "learning_rate": 9.559518295507178e-07,
      "loss": 0.0016,
      "num_tokens": 130750430.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4756,
      "step_time": 18.895425386726856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 177.9375,
      "completions/mean_terminated_length": 177.9375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3949665054678917,
      "epoch": 0.22033348772579897,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005754759069532156,
      "kl": 0.004841367830522358,
      "learning_rate": 9.55942566002779e-07,
      "loss": 0.0002,
      "num_tokens": 130774621.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4757,
      "step_time": 19.492938246577978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 187.5,
      "completions/mean_terminated_length": 187.5,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.4030352756381035,
      "epoch": 0.2203798054654933,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005534633528441191,
      "kl": 0.0049605792155489326,
      "learning_rate": 9.5593330245484e-07,
      "loss": 0.0002,
      "num_tokens": 130797157.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4758,
      "step_time": 23.59635440632701
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 140.0,
      "completions/mean_terminated_length": 140.0,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.31023237109184265,
      "epoch": 0.2204261232051876,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002594081684947014,
      "kl": 0.002225446922238916,
      "learning_rate": 9.559240389069012e-07,
      "loss": 0.0001,
      "num_tokens": 130822741.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4759,
      "step_time": 16.06799967214465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 143.0,
      "completions/mean_terminated_length": 143.0,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.34480416774749756,
      "epoch": 0.2204724409448819,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004018284380435944,
      "kl": 0.00251045823097229,
      "learning_rate": 9.559147753589626e-07,
      "loss": 0.0001,
      "num_tokens": 130845301.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4760,
      "step_time": 15.60106673464179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 178.0,
      "completions/mean_terminated_length": 178.0,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.18607999756932259,
      "epoch": 0.22051875868457618,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11738462001085281,
      "kl": 0.003470874042250216,
      "learning_rate": 9.559055118110235e-07,
      "loss": 0.0343,
      "num_tokens": 130891829.0,
      "reward": 0.9041609764099121,
      "reward_std": 0.0012101116590201855,
      "rewards/reward_func/mean": 0.9041609764099121,
      "rewards/reward_func/std": 0.0012101028114557266,
      "step": 4761,
      "step_time": 25.300740618258715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 272.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 189.1875,
      "completions/mean_terminated_length": 189.1875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.312733419239521,
      "epoch": 0.2205650764242705,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12185503542423248,
      "kl": 0.01029410946648568,
      "learning_rate": 9.558962482630846e-07,
      "loss": -0.1181,
      "num_tokens": 130913176.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 4762,
      "step_time": 22.558964394032955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 127.5,
      "completions/mean_terminated_length": 127.5,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.3007870614528656,
      "epoch": 0.2206113941639648,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004395459312945604,
      "kl": 0.00244244173518382,
      "learning_rate": 9.55886984715146e-07,
      "loss": 0.0001,
      "num_tokens": 130932896.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4763,
      "step_time": 16.356438282877207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 182.375,
      "completions/mean_terminated_length": 182.375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3589734584093094,
      "epoch": 0.2206577119036591,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00452509056776762,
      "kl": 0.00330969673814252,
      "learning_rate": 9.55877721167207e-07,
      "loss": 0.0002,
      "num_tokens": 130959110.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4764,
      "step_time": 20.84293807297945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 446.0,
      "completions/max_terminated_length": 446.0,
      "completions/mean_length": 224.4375,
      "completions/mean_terminated_length": 224.4375,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.4439801424741745,
      "epoch": 0.2207040296433534,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10928815603256226,
      "kl": 0.0056369376834481955,
      "learning_rate": 9.558684576192682e-07,
      "loss": 0.227,
      "num_tokens": 130981293.0,
      "reward": 0.875,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 4765,
      "step_time": 35.4761412255466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 113.875,
      "completions/mean_terminated_length": 113.875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.30517885088920593,
      "epoch": 0.22075034738304772,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023938727099448442,
      "kl": 0.001815505907870829,
      "learning_rate": 9.558591940713293e-07,
      "loss": 0.0001,
      "num_tokens": 131002379.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4766,
      "step_time": 14.039244782179594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 201.25,
      "completions/mean_terminated_length": 201.25,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.22106963396072388,
      "epoch": 0.22079666512274201,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17539939284324646,
      "kl": 0.011657172464765608,
      "learning_rate": 9.558499305233904e-07,
      "loss": -0.0224,
      "num_tokens": 131026671.0,
      "reward": 0.9669851064682007,
      "reward_std": 0.012887682765722275,
      "rewards/reward_func/mean": 0.9669851064682007,
      "rewards/reward_func/std": 0.012887690216302872,
      "step": 4767,
      "step_time": 20.48035392165184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 132.125,
      "completions/mean_terminated_length": 132.125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2446160390973091,
      "epoch": 0.2208429828624363,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002266390947625041,
      "kl": 0.0019007090595550835,
      "learning_rate": 9.558406669754516e-07,
      "loss": 0.0001,
      "num_tokens": 131046177.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4768,
      "step_time": 14.428436130285263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 179.0,
      "completions/mean_terminated_length": 179.0,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.372202567756176,
      "epoch": 0.2208893006021306,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038536088541150093,
      "kl": 0.0025483937351964414,
      "learning_rate": 9.558314034275127e-07,
      "loss": 0.0001,
      "num_tokens": 131072321.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4769,
      "step_time": 19.525206457823515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 145.375,
      "completions/mean_terminated_length": 145.375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.13607894256711006,
      "epoch": 0.22093561834182493,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031790786888450384,
      "kl": 0.021630794275552034,
      "learning_rate": 9.558221398795738e-07,
      "loss": 0.0011,
      "num_tokens": 131103911.0,
      "reward": 0.7292129397392273,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7292129397392273,
      "rewards/reward_func/std": 0.0,
      "step": 4770,
      "step_time": 18.31699935719371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 355.0,
      "completions/max_terminated_length": 355.0,
      "completions/mean_length": 291.1875,
      "completions/mean_terminated_length": 291.1875,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "entropy": 0.22177626937627792,
      "epoch": 0.22098193608151923,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07899122685194016,
      "kl": 0.007844890584237874,
      "learning_rate": 9.55812876331635e-07,
      "loss": -0.0933,
      "num_tokens": 131129674.0,
      "reward": 0.9250829219818115,
      "reward_std": 0.24668878316879272,
      "rewards/reward_func/mean": 0.9250829219818115,
      "rewards/reward_func/std": 0.24668878316879272,
      "step": 4771,
      "step_time": 29.23802850022912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 136.75,
      "completions/mean_terminated_length": 136.75,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.26854807138442993,
      "epoch": 0.22102825382121352,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004177852068096399,
      "kl": 0.0025815270200837404,
      "learning_rate": 9.55803612783696e-07,
      "loss": 0.0001,
      "num_tokens": 131151574.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4772,
      "step_time": 14.716210912913084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 181.3125,
      "completions/mean_terminated_length": 181.3125,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 0.40225064009428024,
      "epoch": 0.22107457156090782,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005004691891372204,
      "kl": 0.003686443087644875,
      "learning_rate": 9.557943492357574e-07,
      "loss": 0.0002,
      "num_tokens": 131177611.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4773,
      "step_time": 20.878033470362425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 128.0,
      "completions/max_terminated_length": 128.0,
      "completions/mean_length": 111.1875,
      "completions/mean_terminated_length": 111.1875,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "entropy": 0.2879294231534004,
      "epoch": 0.22112088930060214,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004238381050527096,
      "kl": 0.002292108372785151,
      "learning_rate": 9.557850856878183e-07,
      "loss": 0.0001,
      "num_tokens": 131199086.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4774,
      "step_time": 13.041929300874472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 228.375,
      "completions/mean_terminated_length": 228.375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.3509957119822502,
      "epoch": 0.22116720704029644,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11466275155544281,
      "kl": 0.02688826760277152,
      "learning_rate": 9.557758221398794e-07,
      "loss": -0.0845,
      "num_tokens": 131224804.0,
      "reward": 0.5,
      "reward_std": 0.5163977742195129,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5163977742195129,
      "step": 4775,
      "step_time": 25.629514146596193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 318.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 271.125,
      "completions/mean_terminated_length": 271.125,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "entropy": 0.2786107286810875,
      "epoch": 0.22121352477999073,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08454053103923798,
      "kl": 0.010854382766410708,
      "learning_rate": 9.557665585919408e-07,
      "loss": -0.0484,
      "num_tokens": 131260614.0,
      "reward": 0.7811205387115479,
      "reward_std": 0.20829880237579346,
      "rewards/reward_func/mean": 0.7811205387115479,
      "rewards/reward_func/std": 0.20829881727695465,
      "step": 4776,
      "step_time": 29.490225601941347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 439.0,
      "completions/max_terminated_length": 439.0,
      "completions/mean_length": 322.3125,
      "completions/mean_terminated_length": 322.3125,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.34080687165260315,
      "epoch": 0.22125984251968503,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07830939441919327,
      "kl": 0.023424079176038504,
      "learning_rate": 9.557572950440019e-07,
      "loss": -0.189,
      "num_tokens": 131294955.0,
      "reward": 0.46473217010498047,
      "reward_std": 0.38188716769218445,
      "rewards/reward_func/mean": 0.46473217010498047,
      "rewards/reward_func/std": 0.38188719749450684,
      "step": 4777,
      "step_time": 36.8105660751462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 253.125,
      "completions/mean_terminated_length": 253.125,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "entropy": 0.21230966225266457,
      "epoch": 0.22130616025937935,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002865222282707691,
      "kl": 0.0025091485294979066,
      "learning_rate": 9.55748031496063e-07,
      "loss": 0.0001,
      "num_tokens": 131333757.0,
      "reward": 0.8111499547958374,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8111499547958374,
      "rewards/reward_func/std": 0.0,
      "step": 4778,
      "step_time": 26.44652072712779
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 178.9375,
      "completions/mean_terminated_length": 178.9375,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 0.38234594464302063,
      "epoch": 0.22135247799907365,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.025664258748292923,
      "kl": 0.01316279056482017,
      "learning_rate": 9.557387679481241e-07,
      "loss": 0.0007,
      "num_tokens": 131363308.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4779,
      "step_time": 21.489420641213655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 133.1875,
      "completions/mean_terminated_length": 133.1875,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.2646244913339615,
      "epoch": 0.22139879573876795,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005343126133084297,
      "kl": 0.0025474660797044635,
      "learning_rate": 9.557295044001853e-07,
      "loss": 0.0001,
      "num_tokens": 131399071.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4780,
      "step_time": 18.119741652160883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 155.375,
      "completions/mean_terminated_length": 155.375,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.4187124893069267,
      "epoch": 0.22144511347846224,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029853687155991793,
      "kl": 0.002423883182927966,
      "learning_rate": 9.557202408522464e-07,
      "loss": 0.0001,
      "num_tokens": 131428501.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4781,
      "step_time": 18.146856732666492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 180.5,
      "completions/mean_terminated_length": 180.5,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.253369003534317,
      "epoch": 0.22149143121815656,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11933569610118866,
      "kl": 0.009504554560407996,
      "learning_rate": 9.557109773043075e-07,
      "loss": -0.0532,
      "num_tokens": 131449309.0,
      "reward": 0.6608119010925293,
      "reward_std": 0.0635676234960556,
      "rewards/reward_func/mean": 0.6608119010925293,
      "rewards/reward_func/std": 0.0635676234960556,
      "step": 4782,
      "step_time": 18.973564580082893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 121.625,
      "completions/mean_terminated_length": 121.625,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.3002912551164627,
      "epoch": 0.22153774895785086,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0045975628308951855,
      "kl": 0.002598756196675822,
      "learning_rate": 9.557017137563686e-07,
      "loss": 0.0001,
      "num_tokens": 131469575.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4783,
      "step_time": 14.004752777516842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 158.3125,
      "completions/mean_terminated_length": 158.3125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.24681223556399345,
      "epoch": 0.22158406669754516,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032644090242683887,
      "kl": 0.0022140414803288877,
      "learning_rate": 9.556924502084298e-07,
      "loss": 0.0001,
      "num_tokens": 131494540.0,
      "reward": 0.9459594488143921,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9459594488143921,
      "rewards/reward_func/std": 0.0,
      "step": 4784,
      "step_time": 17.667529467493296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 287.0,
      "completions/max_terminated_length": 287.0,
      "completions/mean_length": 203.125,
      "completions/mean_terminated_length": 203.125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.2953430190682411,
      "epoch": 0.22163038443723945,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10740058124065399,
      "kl": 0.03867760160937905,
      "learning_rate": 9.556831866604909e-07,
      "loss": -0.0442,
      "num_tokens": 131517790.0,
      "reward": 0.5540241599082947,
      "reward_std": 0.27742329239845276,
      "rewards/reward_func/mean": 0.5540241599082947,
      "rewards/reward_func/std": 0.27742329239845276,
      "step": 4785,
      "step_time": 24.566777862608433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 117.1875,
      "completions/mean_terminated_length": 117.1875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.29800402373075485,
      "epoch": 0.22167670217693378,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002167989732697606,
      "kl": 0.0017821225337684155,
      "learning_rate": 9.55673923112552e-07,
      "loss": 0.0001,
      "num_tokens": 131540881.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4786,
      "step_time": 13.696676794439554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 178.625,
      "completions/mean_terminated_length": 178.625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.20346342399716377,
      "epoch": 0.22172301991662807,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12080593407154083,
      "kl": 0.03451876435428858,
      "learning_rate": 9.556646595646131e-07,
      "loss": -0.0104,
      "num_tokens": 131563147.0,
      "reward": 0.9727948904037476,
      "reward_std": 0.10882046073675156,
      "rewards/reward_func/mean": 0.9727948904037476,
      "rewards/reward_func/std": 0.10882046818733215,
      "step": 4787,
      "step_time": 18.88577552884817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 137.125,
      "completions/mean_terminated_length": 137.125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.2903420254588127,
      "epoch": 0.22176933765632237,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019144221441820264,
      "kl": 0.0015364754654001445,
      "learning_rate": 9.556553960166743e-07,
      "loss": 0.0001,
      "num_tokens": 131585389.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4788,
      "step_time": 15.55664437264204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 122.5625,
      "completions/mean_terminated_length": 122.5625,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "entropy": 0.2679368704557419,
      "epoch": 0.22181565539601666,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006853099446743727,
      "kl": 0.00363956147339195,
      "learning_rate": 9.556461324687354e-07,
      "loss": 0.0002,
      "num_tokens": 131608550.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4789,
      "step_time": 14.307089641690254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 177.4375,
      "completions/mean_terminated_length": 177.4375,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "entropy": 0.17491300031542778,
      "epoch": 0.221861973135711,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001836139359511435,
      "kl": 0.0013805674971081316,
      "learning_rate": 9.556368689207967e-07,
      "loss": 0.0001,
      "num_tokens": 131645853.0,
      "reward": 0.8914703726768494,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8914703726768494,
      "rewards/reward_func/std": 0.0,
      "step": 4790,
      "step_time": 21.233795523643494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 215.75,
      "completions/mean_terminated_length": 215.75,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.241499662399292,
      "epoch": 0.22190829087540528,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0076012942008674145,
      "kl": 0.009055163012817502,
      "learning_rate": 9.556276053728579e-07,
      "loss": 0.0004,
      "num_tokens": 131677257.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4791,
      "step_time": 25.706017028540373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 119.5,
      "completions/mean_terminated_length": 119.5,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.25134844332933426,
      "epoch": 0.22195460861509958,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002584259258583188,
      "kl": 0.002078727964544669,
      "learning_rate": 9.556183418249188e-07,
      "loss": 0.0001,
      "num_tokens": 131698961.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4792,
      "step_time": 15.152140188962221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 178.125,
      "completions/mean_terminated_length": 178.125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.36989825963974,
      "epoch": 0.22200092635479388,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11231192946434021,
      "kl": 0.021663016639649868,
      "learning_rate": 9.5560907827698e-07,
      "loss": 0.0742,
      "num_tokens": 131720243.0,
      "reward": 0.625,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5,
      "step": 4793,
      "step_time": 21.492061279714108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 204.875,
      "completions/mean_terminated_length": 204.875,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "entropy": 0.37613045424222946,
      "epoch": 0.2220472440944882,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004512382671236992,
      "kl": 0.0037859281874261796,
      "learning_rate": 9.555998147290412e-07,
      "loss": 0.0002,
      "num_tokens": 131744785.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4794,
      "step_time": 22.01899578794837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 248.5625,
      "completions/mean_terminated_length": 248.5625,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "entropy": 0.23357480764389038,
      "epoch": 0.2220935618341825,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11967083811759949,
      "kl": 0.021958988159894943,
      "learning_rate": 9.555905511811024e-07,
      "loss": -0.1141,
      "num_tokens": 131783722.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 4795,
      "step_time": 28.449336130172014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 176.25,
      "completions/mean_terminated_length": 176.25,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.3793416693806648,
      "epoch": 0.2221398795738768,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002750967163592577,
      "kl": 0.002727110870182514,
      "learning_rate": 9.555812876331635e-07,
      "loss": 0.0001,
      "num_tokens": 131820478.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4796,
      "step_time": 22.476028122007847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 232.0625,
      "completions/mean_terminated_length": 232.0625,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.3448811545968056,
      "epoch": 0.2221861973135711,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09937860816717148,
      "kl": 0.014102430315688252,
      "learning_rate": 9.555720240852246e-07,
      "loss": -0.0193,
      "num_tokens": 131850783.0,
      "reward": 0.5833531022071838,
      "reward_std": 0.4670347571372986,
      "rewards/reward_func/mean": 0.5833531022071838,
      "rewards/reward_func/std": 0.4670347571372986,
      "step": 4797,
      "step_time": 25.142017655074596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 158.1875,
      "completions/mean_terminated_length": 158.1875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3435058891773224,
      "epoch": 0.2222325150532654,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00758472690358758,
      "kl": 0.006733081070706248,
      "learning_rate": 9.555627605372857e-07,
      "loss": 0.0003,
      "num_tokens": 131872082.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4798,
      "step_time": 16.301632039248943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 124.0,
      "completions/mean_terminated_length": 124.0,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.20530404523015022,
      "epoch": 0.2222788327929597,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004247116856276989,
      "kl": 0.002394238661509007,
      "learning_rate": 9.555534969893468e-07,
      "loss": 0.0001,
      "num_tokens": 131891426.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4799,
      "step_time": 13.246216677129269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 177.375,
      "completions/mean_terminated_length": 177.375,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.24138593301177025,
      "epoch": 0.222325150532654,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015958865405991673,
      "kl": 0.0024760275264270604,
      "learning_rate": 9.55544233441408e-07,
      "loss": 0.0001,
      "num_tokens": 131942264.0,
      "reward": 0.8385766744613647,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8385766744613647,
      "rewards/reward_func/std": 0.0,
      "step": 4800,
      "step_time": 25.961857695132494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 129.75,
      "completions/mean_terminated_length": 129.75,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.22506817057728767,
      "epoch": 0.2223714682723483,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031876340508461,
      "kl": 0.0014770270208828151,
      "learning_rate": 9.55534969893469e-07,
      "loss": 0.0001,
      "num_tokens": 131962740.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4801,
      "step_time": 13.827475391328335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 164.5,
      "completions/mean_terminated_length": 164.5,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.4195042997598648,
      "epoch": 0.22241778601204262,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002598832594230771,
      "kl": 0.0027095703408122063,
      "learning_rate": 9.555257063455302e-07,
      "loss": 0.0001,
      "num_tokens": 132002220.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4802,
      "step_time": 20.75063246116042
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 133.1875,
      "completions/mean_terminated_length": 133.1875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.30575740337371826,
      "epoch": 0.22246410375173692,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005229070782661438,
      "kl": 0.0030032155918888748,
      "learning_rate": 9.555164427975916e-07,
      "loss": 0.0002,
      "num_tokens": 132030191.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4803,
      "step_time": 16.250312250107527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.0,
      "completions/max_terminated_length": 232.0,
      "completions/mean_length": 200.5,
      "completions/mean_terminated_length": 200.5,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "entropy": 0.17106206342577934,
      "epoch": 0.22251042149143122,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11654699593782425,
      "kl": 0.00641520885983482,
      "learning_rate": 9.555071792496527e-07,
      "loss": -0.0502,
      "num_tokens": 132058807.0,
      "reward": 0.8629332184791565,
      "reward_std": 0.04850945621728897,
      "rewards/reward_func/mean": 0.8629332184791565,
      "rewards/reward_func/std": 0.048509448766708374,
      "step": 4804,
      "step_time": 21.568435087800026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 176.8125,
      "completions/mean_terminated_length": 176.8125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.2175315022468567,
      "epoch": 0.2225567392311255,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15883836150169373,
      "kl": 0.03638976812362671,
      "learning_rate": 9.554979157017136e-07,
      "loss": -0.0371,
      "num_tokens": 132088628.0,
      "reward": 0.7926719784736633,
      "reward_std": 0.29351553320884705,
      "rewards/reward_func/mean": 0.7926719784736633,
      "rewards/reward_func/std": 0.29351553320884705,
      "step": 4805,
      "step_time": 19.971502542495728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 179.5,
      "completions/mean_terminated_length": 179.5,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.15308186411857605,
      "epoch": 0.22260305697081983,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004143744707107544,
      "kl": 0.0025578418280929327,
      "learning_rate": 9.55488652153775e-07,
      "loss": 0.0001,
      "num_tokens": 132110332.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 4806,
      "step_time": 17.15120692551136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 152.0,
      "completions/mean_terminated_length": 152.0,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.14756058901548386,
      "epoch": 0.22264937471051413,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2407907247543335,
      "kl": 0.011545327957719564,
      "learning_rate": 9.55479388605836e-07,
      "loss": 0.0067,
      "num_tokens": 132141404.0,
      "reward": 0.9204819202423096,
      "reward_std": 0.03945203125476837,
      "rewards/reward_func/mean": 0.9204819202423096,
      "rewards/reward_func/std": 0.03945203125476837,
      "step": 4807,
      "step_time": 17.25317219272256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 243.0,
      "completions/max_terminated_length": 243.0,
      "completions/mean_length": 189.0,
      "completions/mean_terminated_length": 189.0,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.46452558040618896,
      "epoch": 0.22269569245020843,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008620009757578373,
      "kl": 0.007838520454242826,
      "learning_rate": 9.554701250578972e-07,
      "loss": 0.0004,
      "num_tokens": 132163964.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4808,
      "step_time": 20.910505171865225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 224.0,
      "completions/max_terminated_length": 224.0,
      "completions/mean_length": 211.125,
      "completions/mean_terminated_length": 211.125,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "entropy": 0.20958810299634933,
      "epoch": 0.22274201018990272,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002280582906678319,
      "kl": 0.002195667417254299,
      "learning_rate": 9.554608615099583e-07,
      "loss": 0.0001,
      "num_tokens": 132201678.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4809,
      "step_time": 23.08414401486516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 140.8125,
      "completions/mean_terminated_length": 140.8125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.3175320103764534,
      "epoch": 0.22278832792959705,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002707999898120761,
      "kl": 0.0018491519440431148,
      "learning_rate": 9.554515979620194e-07,
      "loss": 0.0001,
      "num_tokens": 132237851.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4810,
      "step_time": 18.741000294685364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 142.3125,
      "completions/mean_terminated_length": 142.3125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.3955921605229378,
      "epoch": 0.22283464566929134,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004006635397672653,
      "kl": 0.0033857455127872527,
      "learning_rate": 9.554423344140806e-07,
      "loss": 0.0002,
      "num_tokens": 132268016.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4811,
      "step_time": 17.484014619141817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 120.375,
      "completions/mean_terminated_length": 120.375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.352799691259861,
      "epoch": 0.22288096340898564,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004791226238012314,
      "kl": 0.0028124931850470603,
      "learning_rate": 9.554330708661417e-07,
      "loss": 0.0001,
      "num_tokens": 132289558.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4812,
      "step_time": 13.757609866559505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 180.5625,
      "completions/mean_terminated_length": 180.5625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.25405431538820267,
      "epoch": 0.22292728114867993,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13583828508853912,
      "kl": 0.02078463975340128,
      "learning_rate": 9.554238073182028e-07,
      "loss": -0.0579,
      "num_tokens": 132310479.0,
      "reward": 0.6409821510314941,
      "reward_std": 0.2502138018608093,
      "rewards/reward_func/mean": 0.6409821510314941,
      "rewards/reward_func/std": 0.2502138018608093,
      "step": 4813,
      "step_time": 20.335052020847797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 125.0,
      "completions/mean_terminated_length": 125.0,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2138473130762577,
      "epoch": 0.22297359888837426,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002683066064491868,
      "kl": 0.0016082872316474095,
      "learning_rate": 9.55414543770264e-07,
      "loss": 0.0001,
      "num_tokens": 132332031.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4814,
      "step_time": 14.282362803816795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 159.1875,
      "completions/mean_terminated_length": 159.1875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.18920167535543442,
      "epoch": 0.22301991662806855,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002966281259432435,
      "kl": 0.001978725107619539,
      "learning_rate": 9.55405280222325e-07,
      "loss": 0.0001,
      "num_tokens": 132368738.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4815,
      "step_time": 21.076286014169455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 181.0,
      "completions/max_terminated_length": 181.0,
      "completions/mean_length": 151.4375,
      "completions/mean_terminated_length": 151.4375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.23978042230010033,
      "epoch": 0.22306623436776285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18582899868488312,
      "kl": 0.021093164570629597,
      "learning_rate": 9.553960166743864e-07,
      "loss": 0.0407,
      "num_tokens": 132389689.0,
      "reward": 0.8833723068237305,
      "reward_std": 0.005832654424011707,
      "rewards/reward_func/mean": 0.8833723068237305,
      "rewards/reward_func/std": 0.005832654424011707,
      "step": 4816,
      "step_time": 16.47979226708412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 150.125,
      "completions/mean_terminated_length": 150.125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.36553626507520676,
      "epoch": 0.22311255210745715,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026367190293967724,
      "kl": 0.0022281501442193985,
      "learning_rate": 9.553867531264473e-07,
      "loss": 0.0001,
      "num_tokens": 132422779.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4817,
      "step_time": 20.063343908637762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.0,
      "completions/max_terminated_length": 142.0,
      "completions/mean_length": 125.625,
      "completions/mean_terminated_length": 125.625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.3270419165492058,
      "epoch": 0.22315886984715147,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008946054615080357,
      "kl": 0.005291900597512722,
      "learning_rate": 9.553774895785084e-07,
      "loss": 0.0003,
      "num_tokens": 132444917.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4818,
      "step_time": 14.193899523466825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 159.0625,
      "completions/mean_terminated_length": 159.0625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.2129964791238308,
      "epoch": 0.22320518758684577,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13394396007061005,
      "kl": 0.009990689344704151,
      "learning_rate": 9.553682260305696e-07,
      "loss": 0.0285,
      "num_tokens": 132468982.0,
      "reward": 0.9821569919586182,
      "reward_std": 0.03836126998066902,
      "rewards/reward_func/mean": 0.9821569919586182,
      "rewards/reward_func/std": 0.038361258804798126,
      "step": 4819,
      "step_time": 18.739660866558552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 129.0625,
      "completions/mean_terminated_length": 129.0625,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.28548502177000046,
      "epoch": 0.22325150532654006,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007504717912524939,
      "kl": 0.0035440283245407045,
      "learning_rate": 9.55358962482631e-07,
      "loss": 0.0002,
      "num_tokens": 132490951.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4820,
      "step_time": 13.963666781783104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 134.25,
      "completions/mean_terminated_length": 134.25,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.2578406110405922,
      "epoch": 0.22329782306623436,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008697756566107273,
      "kl": 0.0033026170858647674,
      "learning_rate": 9.55349698934692e-07,
      "loss": 0.0002,
      "num_tokens": 132512747.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4821,
      "step_time": 14.992974765598774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 131.6875,
      "completions/mean_terminated_length": 131.6875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.29801688343286514,
      "epoch": 0.22334414080592868,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0046247392892837524,
      "kl": 0.003248437773436308,
      "learning_rate": 9.553404353867531e-07,
      "loss": 0.0002,
      "num_tokens": 132532598.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4822,
      "step_time": 15.101666435599327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 141.9375,
      "completions/mean_terminated_length": 141.9375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.2898029014468193,
      "epoch": 0.22339045854562298,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034714434295892715,
      "kl": 0.0023183961457107216,
      "learning_rate": 9.553311718388143e-07,
      "loss": 0.0001,
      "num_tokens": 132558277.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4823,
      "step_time": 16.132473524659872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 153.0,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 125.0,
      "completions/mean_terminated_length": 125.0,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "entropy": 0.27723686397075653,
      "epoch": 0.22343677628531727,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002960590645670891,
      "kl": 0.002162082295399159,
      "learning_rate": 9.553219082908754e-07,
      "loss": 0.0001,
      "num_tokens": 132578789.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4824,
      "step_time": 14.366881299763918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 155.9375,
      "completions/mean_terminated_length": 155.9375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.445813313126564,
      "epoch": 0.22348309402501157,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027629341930150986,
      "kl": 0.0023690882953815162,
      "learning_rate": 9.553126447429365e-07,
      "loss": 0.0001,
      "num_tokens": 132622612.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4825,
      "step_time": 21.507322683930397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 237.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 203.4375,
      "completions/mean_terminated_length": 203.4375,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.20725734159350395,
      "epoch": 0.2235294117647059,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005832557566463947,
      "kl": 0.009239424020051956,
      "learning_rate": 9.553033811949976e-07,
      "loss": 0.0005,
      "num_tokens": 132651915.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4826,
      "step_time": 22.097034111618996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 141.875,
      "completions/mean_terminated_length": 141.875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.26871396601200104,
      "epoch": 0.2235757295044002,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1823527067899704,
      "kl": 0.037552046589553356,
      "learning_rate": 9.552941176470588e-07,
      "loss": 0.0016,
      "num_tokens": 132671913.0,
      "reward": 0.0625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.0625,
      "rewards/reward_func/std": 0.25,
      "step": 4827,
      "step_time": 16.322473485022783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.0,
      "completions/max_terminated_length": 138.0,
      "completions/mean_length": 121.5,
      "completions/mean_terminated_length": 121.5,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "entropy": 0.2627522647380829,
      "epoch": 0.22362204724409449,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035464696120470762,
      "kl": 0.0020022848329972476,
      "learning_rate": 9.5528485409912e-07,
      "loss": 0.0001,
      "num_tokens": 132692609.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4828,
      "step_time": 13.485189318656921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 209.0,
      "completions/max_terminated_length": 209.0,
      "completions/mean_length": 184.375,
      "completions/mean_terminated_length": 184.375,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.218659158796072,
      "epoch": 0.22366836498378878,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11234995722770691,
      "kl": 0.01561578898690641,
      "learning_rate": 9.55275590551181e-07,
      "loss": -0.0512,
      "num_tokens": 132722695.0,
      "reward": 0.4145066738128662,
      "reward_std": 0.293962687253952,
      "rewards/reward_func/mean": 0.4145066738128662,
      "rewards/reward_func/std": 0.293962687253952,
      "step": 4829,
      "step_time": 20.932147346436977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.0,
      "completions/max_terminated_length": 135.0,
      "completions/mean_length": 116.0625,
      "completions/mean_terminated_length": 116.0625,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2082267664372921,
      "epoch": 0.2237146827234831,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0050287009216845036,
      "kl": 0.0027412628987804055,
      "learning_rate": 9.552663270032421e-07,
      "loss": 0.0001,
      "num_tokens": 132741960.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4830,
      "step_time": 12.699023351073265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 518.0,
      "completions/max_terminated_length": 518.0,
      "completions/mean_length": 224.3125,
      "completions/mean_terminated_length": 224.3125,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.47856853902339935,
      "epoch": 0.2237610004631774,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15555942058563232,
      "kl": 0.008418293320573866,
      "learning_rate": 9.552570634553033e-07,
      "loss": 0.3276,
      "num_tokens": 132788941.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4831,
      "step_time": 46.43001113459468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 127.25,
      "completions/mean_terminated_length": 127.25,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "entropy": 0.2749846763908863,
      "epoch": 0.2238073182028717,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19707569479942322,
      "kl": 0.013525299145840108,
      "learning_rate": 9.552477999073644e-07,
      "loss": -0.1544,
      "num_tokens": 132809345.0,
      "reward": 0.5,
      "reward_std": 0.5163977742195129,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5163977742195129,
      "step": 4832,
      "step_time": 15.494266454130411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 360.0,
      "completions/max_terminated_length": 360.0,
      "completions/mean_length": 307.5,
      "completions/mean_terminated_length": 307.5,
      "completions/min_length": 288.0,
      "completions/min_terminated_length": 288.0,
      "entropy": 0.22295811399817467,
      "epoch": 0.223853635942566,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09239844977855682,
      "kl": 0.014077859930694103,
      "learning_rate": 9.552385363594257e-07,
      "loss": 0.0193,
      "num_tokens": 132846745.0,
      "reward": 0.9796926975250244,
      "reward_std": 0.036326806992292404,
      "rewards/reward_func/mean": 0.9796926975250244,
      "rewards/reward_func/std": 0.036326806992292404,
      "step": 4833,
      "step_time": 32.30364686995745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.0,
      "completions/max_terminated_length": 140.0,
      "completions/mean_length": 123.875,
      "completions/mean_terminated_length": 123.875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.28765248507261276,
      "epoch": 0.22389995368226032,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00323854829184711,
      "kl": 0.0023553569335490465,
      "learning_rate": 9.552292728114869e-07,
      "loss": 0.0001,
      "num_tokens": 132867399.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4834,
      "step_time": 13.34771578013897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 182.0,
      "completions/max_terminated_length": 182.0,
      "completions/mean_length": 159.75,
      "completions/mean_terminated_length": 159.75,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.2327716276049614,
      "epoch": 0.2239462714219546,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007954198867082596,
      "kl": 0.006821100483648479,
      "learning_rate": 9.552200092635478e-07,
      "loss": 0.0003,
      "num_tokens": 132888211.0,
      "reward": 0.7316156029701233,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7316156029701233,
      "rewards/reward_func/std": 0.0,
      "step": 4835,
      "step_time": 16.152273803949356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 154.375,
      "completions/mean_terminated_length": 154.375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.1323045715689659,
      "epoch": 0.2239925891616489,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004046451300382614,
      "kl": 0.04179170448333025,
      "learning_rate": 9.552107457156091e-07,
      "loss": 0.0021,
      "num_tokens": 132912361.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4836,
      "step_time": 16.389708008617163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 122.0,
      "completions/max_terminated_length": 122.0,
      "completions/mean_length": 108.6875,
      "completions/mean_terminated_length": 108.6875,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.23037029802799225,
      "epoch": 0.2240389069013432,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003705455455929041,
      "kl": 0.0016988228308036923,
      "learning_rate": 9.552014821676702e-07,
      "loss": 0.0001,
      "num_tokens": 132932676.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4837,
      "step_time": 12.089770846068859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 181.5,
      "completions/mean_terminated_length": 181.5,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3590536415576935,
      "epoch": 0.22408522464103753,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14399102330207825,
      "kl": 0.019344626693055034,
      "learning_rate": 9.551922186197314e-07,
      "loss": -0.0162,
      "num_tokens": 132969884.0,
      "reward": 0.125,
      "reward_std": 0.34156501293182373,
      "rewards/reward_func/mean": 0.125,
      "rewards/reward_func/std": 0.3415650427341461,
      "step": 4838,
      "step_time": 24.38674383610487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 173.4375,
      "completions/mean_terminated_length": 173.4375,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.17292950302362442,
      "epoch": 0.22413154238073182,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002405304927378893,
      "kl": 0.0020050150342285633,
      "learning_rate": 9.551829550717925e-07,
      "loss": 0.0001,
      "num_tokens": 133003427.0,
      "reward": 0.9167169332504272,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9167169332504272,
      "rewards/reward_func/std": 0.0,
      "step": 4839,
      "step_time": 20.764228850603104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 286.3125,
      "completions/mean_terminated_length": 286.3125,
      "completions/min_length": 263.0,
      "completions/min_terminated_length": 263.0,
      "entropy": 0.19746238738298416,
      "epoch": 0.22417786012042612,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00419808691367507,
      "kl": 0.0033777932985685766,
      "learning_rate": 9.551736915238536e-07,
      "loss": 0.0002,
      "num_tokens": 133034696.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4840,
      "step_time": 28.575475122779608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 151.1875,
      "completions/mean_terminated_length": 151.1875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.16610990837216377,
      "epoch": 0.22422417786012042,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12614548206329346,
      "kl": 0.056223172694444656,
      "learning_rate": 9.551644279759147e-07,
      "loss": -0.017,
      "num_tokens": 133071227.0,
      "reward": 0.9750921726226807,
      "reward_std": 0.06806114315986633,
      "rewards/reward_func/mean": 0.9750921726226807,
      "rewards/reward_func/std": 0.06806114315986633,
      "step": 4841,
      "step_time": 19.617180079221725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 171.0,
      "completions/mean_terminated_length": 171.0,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "entropy": 0.35999053716659546,
      "epoch": 0.22427049559981474,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.02135041542351246,
      "kl": 0.005787533358670771,
      "learning_rate": 9.551551644279759e-07,
      "loss": 0.0003,
      "num_tokens": 133128059.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4842,
      "step_time": 26.984447102993727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 209.0,
      "completions/mean_terminated_length": 209.0,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "entropy": 0.237036794424057,
      "epoch": 0.22431681333950904,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010071357246488333,
      "kl": 0.001211098482599482,
      "learning_rate": 9.55145900880037e-07,
      "loss": 0.0001,
      "num_tokens": 133166395.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4843,
      "step_time": 24.326733384281397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 337.0,
      "completions/max_terminated_length": 337.0,
      "completions/mean_length": 292.625,
      "completions/mean_terminated_length": 292.625,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "entropy": 0.4216890260577202,
      "epoch": 0.22436313107920333,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07943207025527954,
      "kl": 0.004926699912175536,
      "learning_rate": 9.55136637332098e-07,
      "loss": 0.0642,
      "num_tokens": 133194901.0,
      "reward": 0.375,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5,
      "step": 4844,
      "step_time": 28.6319671086967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 269.0,
      "completions/max_terminated_length": 269.0,
      "completions/mean_length": 251.8125,
      "completions/mean_terminated_length": 251.8125,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "entropy": 0.17909115552902222,
      "epoch": 0.22440944881889763,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11312639713287354,
      "kl": 0.011401549680158496,
      "learning_rate": 9.551273737841592e-07,
      "loss": 0.0258,
      "num_tokens": 133234210.0,
      "reward": 0.7825256586074829,
      "reward_std": 0.021327367052435875,
      "rewards/reward_func/mean": 0.7825256586074829,
      "rewards/reward_func/std": 0.021327365189790726,
      "step": 4845,
      "step_time": 26.30367613211274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.0,
      "completions/max_terminated_length": 204.0,
      "completions/mean_length": 169.375,
      "completions/mean_terminated_length": 169.375,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.12030527368187904,
      "epoch": 0.22445576655859195,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10640913993120193,
      "kl": 0.003732732468051836,
      "learning_rate": 9.551181102362206e-07,
      "loss": -0.0576,
      "num_tokens": 133259416.0,
      "reward": 0.8599221706390381,
      "reward_std": 0.14197945594787598,
      "rewards/reward_func/mean": 0.8599221706390381,
      "rewards/reward_func/std": 0.14197945594787598,
      "step": 4846,
      "step_time": 18.804424412548542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.0,
      "completions/max_terminated_length": 168.0,
      "completions/mean_length": 143.5625,
      "completions/mean_terminated_length": 143.5625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.27932412177324295,
      "epoch": 0.22450208429828625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031063167843967676,
      "kl": 0.0021145372884348035,
      "learning_rate": 9.551088466882817e-07,
      "loss": 0.0001,
      "num_tokens": 133279201.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4847,
      "step_time": 15.13044085726142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 204.1875,
      "completions/mean_terminated_length": 204.1875,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.4376760870218277,
      "epoch": 0.22454840203798054,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005663942079991102,
      "kl": 0.0044380699982866645,
      "learning_rate": 9.550995831403426e-07,
      "loss": 0.0002,
      "num_tokens": 133309508.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4848,
      "step_time": 22.924985982477665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 227.0,
      "completions/max_terminated_length": 227.0,
      "completions/mean_length": 178.9375,
      "completions/mean_terminated_length": 178.9375,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3776480481028557,
      "epoch": 0.22459471977767484,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010664451867341995,
      "kl": 0.00503740314161405,
      "learning_rate": 9.550903195924037e-07,
      "loss": 0.0002,
      "num_tokens": 133353443.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4849,
      "step_time": 25.17474266514182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 213.4375,
      "completions/mean_terminated_length": 213.4375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "entropy": 0.4941771626472473,
      "epoch": 0.22464103751736916,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11145656555891037,
      "kl": 0.010323689552024007,
      "learning_rate": 9.55081056044465e-07,
      "loss": 0.0053,
      "num_tokens": 133375498.0,
      "reward": 0.6875,
      "reward_std": 0.4787135720252991,
      "rewards/reward_func/mean": 0.6875,
      "rewards/reward_func/std": 0.4787135720252991,
      "step": 4850,
      "step_time": 22.1133300550282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 349.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 282.375,
      "completions/mean_terminated_length": 282.375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "entropy": 0.3258415758609772,
      "epoch": 0.22468735525706346,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08351670205593109,
      "kl": 0.016132496297359467,
      "learning_rate": 9.550717924965262e-07,
      "loss": -0.1205,
      "num_tokens": 133404640.0,
      "reward": 0.48923105001449585,
      "reward_std": 0.3325233459472656,
      "rewards/reward_func/mean": 0.48923105001449585,
      "rewards/reward_func/std": 0.3325233459472656,
      "step": 4851,
      "step_time": 29.518002193421125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 187.3125,
      "completions/mean_terminated_length": 187.3125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.4024698808789253,
      "epoch": 0.22473367299675776,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01081777922809124,
      "kl": 0.007352304412052035,
      "learning_rate": 9.550625289485873e-07,
      "loss": 0.0004,
      "num_tokens": 133438549.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4852,
      "step_time": 22.758445031940937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 198.375,
      "completions/mean_terminated_length": 198.375,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "entropy": 0.29424215108156204,
      "epoch": 0.22477999073645205,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12527966499328613,
      "kl": 0.029027292504906654,
      "learning_rate": 9.550532654006484e-07,
      "loss": -0.0059,
      "num_tokens": 133469387.0,
      "reward": 0.5771113634109497,
      "reward_std": 0.1708800345659256,
      "rewards/reward_func/mean": 0.5771113634109497,
      "rewards/reward_func/std": 0.1708800494670868,
      "step": 4853,
      "step_time": 20.81361961737275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 166.875,
      "completions/mean_terminated_length": 166.875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.36771079152822495,
      "epoch": 0.22482630847614637,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024784598499536514,
      "kl": 0.0021381523984018713,
      "learning_rate": 9.550440018527096e-07,
      "loss": 0.0001,
      "num_tokens": 133499433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4854,
      "step_time": 19.139034681022167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 171.1875,
      "completions/mean_terminated_length": 171.1875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.3991065099835396,
      "epoch": 0.22487262621584067,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004687669686973095,
      "kl": 0.004851402132771909,
      "learning_rate": 9.550347383047707e-07,
      "loss": 0.0002,
      "num_tokens": 133519964.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4855,
      "step_time": 18.17878869175911
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 160.9375,
      "completions/mean_terminated_length": 160.9375,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.29705192148685455,
      "epoch": 0.22491894395553497,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16597846150398254,
      "kl": 0.02548988931812346,
      "learning_rate": 9.550254747568318e-07,
      "loss": 0.0202,
      "num_tokens": 133542331.0,
      "reward": 0.7965884804725647,
      "reward_std": 0.31098392605781555,
      "rewards/reward_func/mean": 0.7965884804725647,
      "rewards/reward_func/std": 0.31098392605781555,
      "step": 4856,
      "step_time": 18.238618656992912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 180.1875,
      "completions/mean_terminated_length": 180.1875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.4650045931339264,
      "epoch": 0.22496526169522926,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01402257476001978,
      "kl": 0.011438263114541769,
      "learning_rate": 9.55016211208893e-07,
      "loss": 0.0006,
      "num_tokens": 133567422.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4857,
      "step_time": 20.366154816001654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 159.3125,
      "completions/mean_terminated_length": 159.3125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.3271801993250847,
      "epoch": 0.2250115794349236,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.021235935389995575,
      "kl": 0.009877411066554487,
      "learning_rate": 9.55006947660954e-07,
      "loss": 0.0005,
      "num_tokens": 133592803.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4858,
      "step_time": 17.569607835263014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 162.5,
      "completions/mean_terminated_length": 162.5,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.2190491333603859,
      "epoch": 0.22505789717461788,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029440466314554214,
      "kl": 0.0017971152847167104,
      "learning_rate": 9.549976841130152e-07,
      "loss": 0.0001,
      "num_tokens": 133613083.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4859,
      "step_time": 16.454703859984875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 192.1875,
      "completions/mean_terminated_length": 192.1875,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.38992976397275925,
      "epoch": 0.22510421491431218,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10814333707094193,
      "kl": 0.007516085519455373,
      "learning_rate": 9.549884205650763e-07,
      "loss": -0.0683,
      "num_tokens": 133642494.0,
      "reward": 0.05706879496574402,
      "reward_std": 0.22827517986297607,
      "rewards/reward_func/mean": 0.05706879496574402,
      "rewards/reward_func/std": 0.22827517986297607,
      "step": 4860,
      "step_time": 22.814553260803223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 160.1875,
      "completions/mean_terminated_length": 160.1875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "entropy": 0.2716098949313164,
      "epoch": 0.22515053265400647,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005426190327852964,
      "kl": 0.002436083974316716,
      "learning_rate": 9.549791570171374e-07,
      "loss": 0.0001,
      "num_tokens": 133663489.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4861,
      "step_time": 17.264394018799067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 168.6875,
      "completions/mean_terminated_length": 168.6875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.43288785219192505,
      "epoch": 0.2251968503937008,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003112137783318758,
      "kl": 0.0028312166105024517,
      "learning_rate": 9.549698934691986e-07,
      "loss": 0.0001,
      "num_tokens": 133716876.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4862,
      "step_time": 26.97471345961094
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 185.0,
      "completions/mean_length": 166.625,
      "completions/mean_terminated_length": 166.625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.20235447958111763,
      "epoch": 0.2252431681333951,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14627960324287415,
      "kl": 0.04060222953557968,
      "learning_rate": 9.5496062992126e-07,
      "loss": 0.0097,
      "num_tokens": 133748438.0,
      "reward": 0.9768627882003784,
      "reward_std": 0.035443443804979324,
      "rewards/reward_func/mean": 0.9768627882003784,
      "rewards/reward_func/std": 0.03544343635439873,
      "step": 4863,
      "step_time": 19.009066738188267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 195.3125,
      "completions/mean_terminated_length": 195.3125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.4221041575074196,
      "epoch": 0.2252894858730894,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003028387203812599,
      "kl": 0.0028028357191942632,
      "learning_rate": 9.54951366373321e-07,
      "loss": 0.0001,
      "num_tokens": 133784283.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4864,
      "step_time": 23.230973970144987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 171.1875,
      "completions/mean_terminated_length": 171.1875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.19607597962021828,
      "epoch": 0.22533580361278369,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0043772365897893906,
      "kl": 0.01675504120066762,
      "learning_rate": 9.549421028253822e-07,
      "loss": 0.0008,
      "num_tokens": 133809422.0,
      "reward": 0.22842517495155334,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.22842517495155334,
      "rewards/reward_func/std": 0.0,
      "step": 4865,
      "step_time": 17.712210282683372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 157.0,
      "completions/max_terminated_length": 157.0,
      "completions/mean_length": 122.8125,
      "completions/mean_terminated_length": 122.8125,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.272648885846138,
      "epoch": 0.225382121352478,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002636377001181245,
      "kl": 0.0020418757922016084,
      "learning_rate": 9.54932839277443e-07,
      "loss": 0.0001,
      "num_tokens": 133830955.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4866,
      "step_time": 14.760128911584616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 222.6875,
      "completions/mean_terminated_length": 222.6875,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "entropy": 0.3675032928586006,
      "epoch": 0.2254284390921723,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08764305710792542,
      "kl": 0.02246140781790018,
      "learning_rate": 9.549235757295044e-07,
      "loss": -0.0545,
      "num_tokens": 133859830.0,
      "reward": 0.1875,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.1875,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 4867,
      "step_time": 24.06194742396474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 220.375,
      "completions/mean_terminated_length": 220.375,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "entropy": 0.33972568809986115,
      "epoch": 0.2254747568318666,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11992360651493073,
      "kl": 0.018236295087262988,
      "learning_rate": 9.549143121815655e-07,
      "loss": 0.0123,
      "num_tokens": 133891916.0,
      "reward": 0.4575090706348419,
      "reward_std": 0.4290746748447418,
      "rewards/reward_func/mean": 0.4575090706348419,
      "rewards/reward_func/std": 0.4290747046470642,
      "step": 4868,
      "step_time": 23.201321847736835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 275.0,
      "completions/max_terminated_length": 275.0,
      "completions/mean_length": 188.4375,
      "completions/mean_terminated_length": 188.4375,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.2368924766778946,
      "epoch": 0.2255210745715609,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07918360084295273,
      "kl": 0.006348276045173407,
      "learning_rate": 9.549050486336267e-07,
      "loss": -0.1143,
      "num_tokens": 133915491.0,
      "reward": 0.2714614272117615,
      "reward_std": 0.06292904913425446,
      "rewards/reward_func/mean": 0.2714614272117615,
      "rewards/reward_func/std": 0.06292904168367386,
      "step": 4869,
      "step_time": 24.9329380877316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 407.0,
      "completions/max_terminated_length": 407.0,
      "completions/mean_length": 291.0,
      "completions/mean_terminated_length": 291.0,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "entropy": 0.3972603529691696,
      "epoch": 0.22556739231125522,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09982740879058838,
      "kl": 0.022194479126483202,
      "learning_rate": 9.548957850856878e-07,
      "loss": -0.0595,
      "num_tokens": 133946435.0,
      "reward": 0.7962749004364014,
      "reward_std": 0.3643563985824585,
      "rewards/reward_func/mean": 0.7962749004364014,
      "rewards/reward_func/std": 0.3643563985824585,
      "step": 4870,
      "step_time": 34.399493522942066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 156.5,
      "completions/mean_terminated_length": 156.5,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3247595801949501,
      "epoch": 0.22561371005094952,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00221113464795053,
      "kl": 0.0019436216680333018,
      "learning_rate": 9.54886521537749e-07,
      "loss": 0.0001,
      "num_tokens": 133984075.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4871,
      "step_time": 20.088803719729185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 130.3125,
      "completions/mean_terminated_length": 130.3125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.32415852695703506,
      "epoch": 0.2256600277906438,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0054161823354661465,
      "kl": 0.002918284444604069,
      "learning_rate": 9.5487725798981e-07,
      "loss": 0.0001,
      "num_tokens": 134006480.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4872,
      "step_time": 15.15906186774373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 159.0,
      "completions/max_terminated_length": 159.0,
      "completions/mean_length": 141.4375,
      "completions/mean_terminated_length": 141.4375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.33338750898838043,
      "epoch": 0.2257063455303381,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0047162859700620174,
      "kl": 0.0036268249386921525,
      "learning_rate": 9.548679944418712e-07,
      "loss": 0.0002,
      "num_tokens": 134030695.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4873,
      "step_time": 15.892490446567535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 245.0,
      "completions/max_terminated_length": 245.0,
      "completions/mean_length": 218.1875,
      "completions/mean_terminated_length": 218.1875,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "entropy": 0.1801813393831253,
      "epoch": 0.22575266327003243,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004071139730513096,
      "kl": 0.0053452952997758985,
      "learning_rate": 9.548587308939323e-07,
      "loss": 0.0003,
      "num_tokens": 134062810.0,
      "reward": 0.33445996046066284,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.33445996046066284,
      "rewards/reward_func/std": 0.0,
      "step": 4874,
      "step_time": 23.980526633560658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 196.0,
      "completions/max_terminated_length": 196.0,
      "completions/mean_length": 148.875,
      "completions/mean_terminated_length": 148.875,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.40750253200531006,
      "epoch": 0.22579898100972673,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0073066893965005875,
      "kl": 0.005031371954828501,
      "learning_rate": 9.548494673459934e-07,
      "loss": 0.0003,
      "num_tokens": 134087144.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4875,
      "step_time": 17.886852901428938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 143.5,
      "completions/mean_terminated_length": 143.5,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.30172694474458694,
      "epoch": 0.22584529874942103,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0398525670170784,
      "kl": 0.003992043435573578,
      "learning_rate": 9.548402037980547e-07,
      "loss": 0.0002,
      "num_tokens": 134116128.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4876,
      "step_time": 19.714687902480364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 157.0625,
      "completions/mean_terminated_length": 157.0625,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.20624738186597824,
      "epoch": 0.22589161648911532,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010315127670764923,
      "kl": 0.07822940871119499,
      "learning_rate": 9.548309402501159e-07,
      "loss": 0.0039,
      "num_tokens": 134136657.0,
      "reward": 0.7403417825698853,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7403417825698853,
      "rewards/reward_func/std": 0.0,
      "step": 4877,
      "step_time": 17.194463308900595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 137.25,
      "completions/mean_terminated_length": 137.25,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.31360918283462524,
      "epoch": 0.22593793422880964,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00660826126113534,
      "kl": 0.004416961804963648,
      "learning_rate": 9.548216767021768e-07,
      "loss": 0.0002,
      "num_tokens": 134158757.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4878,
      "step_time": 16.34247576072812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 204.625,
      "completions/mean_terminated_length": 204.625,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "entropy": 0.24317792803049088,
      "epoch": 0.22598425196850394,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17206689715385437,
      "kl": 0.02334041357971728,
      "learning_rate": 9.54812413154238e-07,
      "loss": -0.0217,
      "num_tokens": 134182671.0,
      "reward": 0.7352595329284668,
      "reward_std": 0.10705790668725967,
      "rewards/reward_func/mean": 0.7352595329284668,
      "rewards/reward_func/std": 0.10705791413784027,
      "step": 4879,
      "step_time": 19.69738906994462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 133.0,
      "completions/max_terminated_length": 133.0,
      "completions/mean_length": 110.9375,
      "completions/mean_terminated_length": 110.9375,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "entropy": 0.2758304551243782,
      "epoch": 0.22603056970819824,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022120364010334015,
      "kl": 0.0017564271984156221,
      "learning_rate": 9.548031496062992e-07,
      "loss": 0.0001,
      "num_tokens": 134203342.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4880,
      "step_time": 12.976031545549631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 230.0,
      "completions/max_terminated_length": 230.0,
      "completions/mean_length": 199.5625,
      "completions/mean_terminated_length": 199.5625,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.32215501368045807,
      "epoch": 0.22607688744789253,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1277199685573578,
      "kl": 0.015220458968542516,
      "learning_rate": 9.547938860583604e-07,
      "loss": 0.0089,
      "num_tokens": 134237143.0,
      "reward": 0.9481692314147949,
      "reward_std": 0.001204445492476225,
      "rewards/reward_func/mean": 0.9481692314147949,
      "rewards/reward_func/std": 0.001204445492476225,
      "step": 4881,
      "step_time": 23.702144730836153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 150.5625,
      "completions/mean_terminated_length": 150.5625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.41165612637996674,
      "epoch": 0.22612320518758686,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016905659576877952,
      "kl": 0.0018852135981433094,
      "learning_rate": 9.547846225104215e-07,
      "loss": 0.0001,
      "num_tokens": 134279904.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4882,
      "step_time": 20.901509072631598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 191.0,
      "completions/max_terminated_length": 191.0,
      "completions/mean_length": 161.625,
      "completions/mean_terminated_length": 161.625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.21951115131378174,
      "epoch": 0.22616952292728115,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25192928314208984,
      "kl": 0.0586194870993495,
      "learning_rate": 9.547753589624826e-07,
      "loss": -0.0217,
      "num_tokens": 134300826.0,
      "reward": 0.5538828372955322,
      "reward_std": 0.5049868226051331,
      "rewards/reward_func/mean": 0.5538828372955322,
      "rewards/reward_func/std": 0.5049868226051331,
      "step": 4883,
      "step_time": 16.96540416777134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 171.5625,
      "completions/mean_terminated_length": 171.5625,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.37507422268390656,
      "epoch": 0.22621584066697545,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004508259706199169,
      "kl": 0.0031737511744722724,
      "learning_rate": 9.547660954145437e-07,
      "loss": 0.0002,
      "num_tokens": 134330643.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4884,
      "step_time": 20.733138252049685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.0,
      "completions/max_terminated_length": 248.0,
      "completions/mean_length": 164.4375,
      "completions/mean_terminated_length": 164.4375,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 0.35645201057195663,
      "epoch": 0.22626215840666974,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18233071267604828,
      "kl": 0.011758289067074656,
      "learning_rate": 9.547568318666049e-07,
      "loss": -0.0929,
      "num_tokens": 134351210.0,
      "reward": 0.05026397109031677,
      "reward_std": 0.2010558843612671,
      "rewards/reward_func/mean": 0.05026397109031677,
      "rewards/reward_func/std": 0.2010558843612671,
      "step": 4885,
      "step_time": 20.797563511878252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 165.4375,
      "completions/mean_terminated_length": 165.4375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "entropy": 0.3204440176486969,
      "epoch": 0.22630847614636407,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004753609653562307,
      "kl": 0.0034997077891603112,
      "learning_rate": 9.54747568318666e-07,
      "loss": 0.0002,
      "num_tokens": 134373169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4886,
      "step_time": 18.538682401180267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 181.6875,
      "completions/mean_terminated_length": 181.6875,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "entropy": 0.10731195658445358,
      "epoch": 0.22635479388605836,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4505479335784912,
      "kl": 0.12920604646205902,
      "learning_rate": 9.547383047707271e-07,
      "loss": 0.0289,
      "num_tokens": 134398988.0,
      "reward": 0.9848394989967346,
      "reward_std": 0.06064195558428764,
      "rewards/reward_func/mean": 0.9848394989967346,
      "rewards/reward_func/std": 0.06064195930957794,
      "step": 4887,
      "step_time": 18.65380907431245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 130.6875,
      "completions/mean_terminated_length": 130.6875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.27973808348178864,
      "epoch": 0.22640111162575266,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005234105978161097,
      "kl": 0.0028177001513540745,
      "learning_rate": 9.547290412227882e-07,
      "loss": 0.0001,
      "num_tokens": 134419927.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4888,
      "step_time": 14.913063060492277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 123.0625,
      "completions/mean_terminated_length": 123.0625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.23313026875257492,
      "epoch": 0.22644742936544696,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038439063355326653,
      "kl": 0.0019966693071182817,
      "learning_rate": 9.547197776748494e-07,
      "loss": 0.0001,
      "num_tokens": 134439560.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4889,
      "step_time": 14.961370319128036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 164.6875,
      "completions/mean_terminated_length": 164.6875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3789483979344368,
      "epoch": 0.22649374710514128,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033721274230629206,
      "kl": 0.0027932623052038252,
      "learning_rate": 9.547105141269107e-07,
      "loss": 0.0001,
      "num_tokens": 134471923.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4890,
      "step_time": 18.853180646896362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 237.8125,
      "completions/mean_terminated_length": 237.8125,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "entropy": 0.23378483206033707,
      "epoch": 0.22654006484483558,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0055068242363631725,
      "kl": 0.006216542446054518,
      "learning_rate": 9.547012505789716e-07,
      "loss": 0.0003,
      "num_tokens": 134494656.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4891,
      "step_time": 22.65500281378627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 471.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 259.9375,
      "completions/mean_terminated_length": 259.9375,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "entropy": 0.3920636996626854,
      "epoch": 0.22658638258452987,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13632963597774506,
      "kl": 0.01857087411917746,
      "learning_rate": 9.546919870310327e-07,
      "loss": -0.2208,
      "num_tokens": 134523295.0,
      "reward": 0.11859824508428574,
      "reward_std": 0.24871979653835297,
      "rewards/reward_func/mean": 0.11859824508428574,
      "rewards/reward_func/std": 0.24871981143951416,
      "step": 4892,
      "step_time": 38.42748509719968
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 158.4375,
      "completions/mean_terminated_length": 158.4375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.2201010026037693,
      "epoch": 0.22663270032422417,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1770990639925003,
      "kl": 0.009945785510353744,
      "learning_rate": 9.54682723483094e-07,
      "loss": 0.0005,
      "num_tokens": 134544246.0,
      "reward": 0.9550249576568604,
      "reward_std": 0.040965043008327484,
      "rewards/reward_func/mean": 0.9550249576568604,
      "rewards/reward_func/std": 0.040965043008327484,
      "step": 4893,
      "step_time": 17.550763957202435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 168.0,
      "completions/mean_terminated_length": 168.0,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.4384583607316017,
      "epoch": 0.2266790180639185,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027022287249565125,
      "kl": 0.002615723351482302,
      "learning_rate": 9.546734599351552e-07,
      "loss": 0.0001,
      "num_tokens": 134597286.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4894,
      "step_time": 26.55898481607437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 172.0,
      "completions/mean_terminated_length": 172.0,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.19407839700579643,
      "epoch": 0.2267253358036128,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021540976595133543,
      "kl": 0.0016915185115067288,
      "learning_rate": 9.546641963872163e-07,
      "loss": 0.0001,
      "num_tokens": 134627958.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4895,
      "step_time": 20.132581286132336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 145.0,
      "completions/max_terminated_length": 145.0,
      "completions/mean_length": 128.8125,
      "completions/mean_terminated_length": 128.8125,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.26934972405433655,
      "epoch": 0.22677165354330708,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006218805443495512,
      "kl": 0.004028201394248754,
      "learning_rate": 9.546549328392774e-07,
      "loss": 0.0002,
      "num_tokens": 134650403.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4896,
      "step_time": 14.688025809824467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 199.375,
      "completions/mean_terminated_length": 199.375,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.14547937363386154,
      "epoch": 0.22681797128300138,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035513699986040592,
      "kl": 0.004947292327415198,
      "learning_rate": 9.546456692913386e-07,
      "loss": 0.0002,
      "num_tokens": 134672585.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4897,
      "step_time": 19.39472997561097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 181.3125,
      "completions/mean_terminated_length": 181.3125,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.45307208597660065,
      "epoch": 0.2268642890226957,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023464923724532127,
      "kl": 0.0026051446329802275,
      "learning_rate": 9.546364057433997e-07,
      "loss": 0.0001,
      "num_tokens": 134717230.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4898,
      "step_time": 26.225048411637545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 167.875,
      "completions/mean_terminated_length": 167.875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3557990789413452,
      "epoch": 0.22691060676239,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1502242088317871,
      "kl": 0.014027463272213936,
      "learning_rate": 9.546271421954608e-07,
      "loss": 0.0429,
      "num_tokens": 134738252.0,
      "reward": 0.4478321075439453,
      "reward_std": 0.3582656681537628,
      "rewards/reward_func/mean": 0.4478321075439453,
      "rewards/reward_func/std": 0.3582656681537628,
      "step": 4899,
      "step_time": 18.294767417013645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 165.0,
      "completions/max_terminated_length": 165.0,
      "completions/mean_length": 137.3125,
      "completions/mean_terminated_length": 137.3125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.24131276458501816,
      "epoch": 0.2269569245020843,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026569655165076256,
      "kl": 0.002151376858819276,
      "learning_rate": 9.54617878647522e-07,
      "loss": 0.0001,
      "num_tokens": 134760081.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4900,
      "step_time": 15.34033501893282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 163.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 144.0,
      "completions/mean_terminated_length": 144.0,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.3678324818611145,
      "epoch": 0.2270032422417786,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00403813598677516,
      "kl": 0.004269542056135833,
      "learning_rate": 9.54608615099583e-07,
      "loss": 0.0002,
      "num_tokens": 134792385.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4901,
      "step_time": 17.436243526637554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 190.0,
      "completions/max_terminated_length": 190.0,
      "completions/mean_length": 152.0625,
      "completions/mean_terminated_length": 152.0625,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.17099004611372948,
      "epoch": 0.22704955998147291,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.35982704162597656,
      "kl": 0.03482277039438486,
      "learning_rate": 9.545993515516442e-07,
      "loss": -0.0242,
      "num_tokens": 134813042.0,
      "reward": 0.4540383219718933,
      "reward_std": 0.2948898673057556,
      "rewards/reward_func/mean": 0.4540383219718933,
      "rewards/reward_func/std": 0.2948898673057556,
      "step": 4902,
      "step_time": 16.71489003673196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 275.0,
      "completions/max_terminated_length": 275.0,
      "completions/mean_length": 230.3125,
      "completions/mean_terminated_length": 230.3125,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "entropy": 0.1927366815507412,
      "epoch": 0.2270958777211672,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13495691120624542,
      "kl": 0.01703414274379611,
      "learning_rate": 9.545900880037053e-07,
      "loss": -0.1234,
      "num_tokens": 134849895.0,
      "reward": 0.5759466886520386,
      "reward_std": 0.20994091033935547,
      "rewards/reward_func/mean": 0.5759466886520386,
      "rewards/reward_func/std": 0.20994091033935547,
      "step": 4903,
      "step_time": 26.500569820404053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 147.0,
      "completions/max_terminated_length": 147.0,
      "completions/mean_length": 120.1875,
      "completions/mean_terminated_length": 120.1875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.2903394475579262,
      "epoch": 0.2271421954608615,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010166214779019356,
      "kl": 0.004449092783033848,
      "learning_rate": 9.545808244557664e-07,
      "loss": 0.0002,
      "num_tokens": 134871130.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4904,
      "step_time": 14.358096912503242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 169.625,
      "completions/mean_terminated_length": 169.625,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "entropy": 0.35955333709716797,
      "epoch": 0.2271885132005558,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004139995668083429,
      "kl": 0.00286450277781114,
      "learning_rate": 9.545715609078276e-07,
      "loss": 0.0001,
      "num_tokens": 134893124.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4905,
      "step_time": 17.006395772099495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 200.0,
      "completions/max_terminated_length": 200.0,
      "completions/mean_length": 173.6875,
      "completions/mean_terminated_length": 173.6875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "entropy": 0.30060071498155594,
      "epoch": 0.22723483094025013,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002366934670135379,
      "kl": 0.001916272594826296,
      "learning_rate": 9.54562297359889e-07,
      "loss": 0.0001,
      "num_tokens": 134917535.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4906,
      "step_time": 18.552723117172718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.0,
      "completions/max_terminated_length": 219.0,
      "completions/mean_length": 182.75,
      "completions/mean_terminated_length": 182.75,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.20699544623494148,
      "epoch": 0.22728114867994442,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019894258584827185,
      "kl": 0.0015113575500436127,
      "learning_rate": 9.5455303381195e-07,
      "loss": 0.0001,
      "num_tokens": 134947003.0,
      "reward": 0.9355069994926453,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9355069994926453,
      "rewards/reward_func/std": 0.0,
      "step": 4907,
      "step_time": 20.712279092520475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 136.8125,
      "completions/mean_terminated_length": 136.8125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "entropy": 0.32988011091947556,
      "epoch": 0.22732746641963872,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013620874844491482,
      "kl": 0.003609262639656663,
      "learning_rate": 9.545437702640112e-07,
      "loss": 0.0002,
      "num_tokens": 134972168.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4908,
      "step_time": 15.561688888818026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 283.0,
      "completions/max_terminated_length": 283.0,
      "completions/mean_length": 250.5,
      "completions/mean_terminated_length": 250.5,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "entropy": 0.24873162060976028,
      "epoch": 0.22737378415933301,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005233952309936285,
      "kl": 0.00603331346064806,
      "learning_rate": 9.54534506716072e-07,
      "loss": 0.0003,
      "num_tokens": 135011504.0,
      "reward": 0.7873122096061707,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7873122096061707,
      "rewards/reward_func/std": 0.0,
      "step": 4909,
      "step_time": 27.324167914688587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 170.0625,
      "completions/mean_terminated_length": 170.0625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "entropy": 0.179918572306633,
      "epoch": 0.22742010189902734,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11866836994886398,
      "kl": 0.02200413029640913,
      "learning_rate": 9.545252431681334e-07,
      "loss": -0.0081,
      "num_tokens": 135034449.0,
      "reward": 0.9923305511474609,
      "reward_std": 0.016488710418343544,
      "rewards/reward_func/mean": 0.9923305511474609,
      "rewards/reward_func/std": 0.016488708555698395,
      "step": 4910,
      "step_time": 17.212855003774166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 189.0,
      "completions/mean_length": 161.125,
      "completions/mean_terminated_length": 161.125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.18011927977204323,
      "epoch": 0.22746641963872163,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008853848092257977,
      "kl": 0.0037003319012001157,
      "learning_rate": 9.545159796201945e-07,
      "loss": 0.0002,
      "num_tokens": 135055475.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4911,
      "step_time": 16.66002894937992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 208.0,
      "completions/max_terminated_length": 208.0,
      "completions/mean_length": 185.875,
      "completions/mean_terminated_length": 185.875,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.17935670539736748,
      "epoch": 0.22751273737841593,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10368917882442474,
      "kl": 0.005186059512197971,
      "learning_rate": 9.545067160722557e-07,
      "loss": 0.0001,
      "num_tokens": 135092625.0,
      "reward": 0.7021901607513428,
      "reward_std": 0.036369748413562775,
      "rewards/reward_func/mean": 0.7021901607513428,
      "rewards/reward_func/std": 0.03636974096298218,
      "step": 4912,
      "step_time": 21.843701250851154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 162.0,
      "completions/max_terminated_length": 162.0,
      "completions/mean_length": 137.5,
      "completions/mean_terminated_length": 137.5,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.31793226301670074,
      "epoch": 0.22755905511811023,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002297846833243966,
      "kl": 0.0020571271888911724,
      "learning_rate": 9.544974525243168e-07,
      "loss": 0.0001,
      "num_tokens": 135113817.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4913,
      "step_time": 15.05922843515873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 202.0,
      "completions/mean_length": 194.0,
      "completions/mean_terminated_length": 194.0,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.17006618157029152,
      "epoch": 0.22760537285780455,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006562127731740475,
      "kl": 0.005386265926063061,
      "learning_rate": 9.54488188976378e-07,
      "loss": 0.0003,
      "num_tokens": 135138057.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4914,
      "step_time": 18.65556411445141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 162.6875,
      "completions/mean_terminated_length": 162.6875,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "entropy": 0.4126560464501381,
      "epoch": 0.22765169059749885,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004423164296895266,
      "kl": 0.002680208534002304,
      "learning_rate": 9.54478925428439e-07,
      "loss": 0.0001,
      "num_tokens": 135177540.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4915,
      "step_time": 20.52718361467123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 123.0,
      "completions/max_terminated_length": 123.0,
      "completions/mean_length": 108.1875,
      "completions/mean_terminated_length": 108.1875,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "entropy": 0.32868946343660355,
      "epoch": 0.22769800833719314,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030916209798306227,
      "kl": 0.0024434159859083593,
      "learning_rate": 9.544696618805002e-07,
      "loss": 0.0001,
      "num_tokens": 135201207.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4916,
      "step_time": 13.112210553139448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 218.0,
      "completions/mean_terminated_length": 218.0,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "entropy": 0.48267073929309845,
      "epoch": 0.22774432607688744,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007472446653991938,
      "kl": 0.005911440821364522,
      "learning_rate": 9.544603983325613e-07,
      "loss": 0.0003,
      "num_tokens": 135230311.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4917,
      "step_time": 23.791458789259195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 191.0625,
      "completions/mean_terminated_length": 191.0625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.34144359081983566,
      "epoch": 0.22779064381658176,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16799886524677277,
      "kl": 0.022874554619193077,
      "learning_rate": 9.544511347846224e-07,
      "loss": -0.0042,
      "num_tokens": 135251256.0,
      "reward": 0.9706025123596191,
      "reward_std": 0.08568520843982697,
      "rewards/reward_func/mean": 0.9706025123596191,
      "rewards/reward_func/std": 0.08568520843982697,
      "step": 4918,
      "step_time": 19.11159225180745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 125.0,
      "completions/max_terminated_length": 125.0,
      "completions/mean_length": 115.6875,
      "completions/mean_terminated_length": 115.6875,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "entropy": 0.2824091613292694,
      "epoch": 0.22783696155627606,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0046332888305187225,
      "kl": 0.0032130761828739196,
      "learning_rate": 9.544418712366835e-07,
      "loss": 0.0002,
      "num_tokens": 135272195.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4919,
      "step_time": 12.575989741832018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 330.0,
      "completions/max_terminated_length": 330.0,
      "completions/mean_length": 236.625,
      "completions/mean_terminated_length": 236.625,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.3163532689213753,
      "epoch": 0.22788327929597035,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1030849814414978,
      "kl": 0.017714201007038355,
      "learning_rate": 9.544326076887449e-07,
      "loss": -0.0116,
      "num_tokens": 135307485.0,
      "reward": 0.6404792070388794,
      "reward_std": 0.17094388604164124,
      "rewards/reward_func/mean": 0.6404792070388794,
      "rewards/reward_func/std": 0.17094388604164124,
      "step": 4920,
      "step_time": 29.51052325963974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.0,
      "completions/max_terminated_length": 141.0,
      "completions/mean_length": 123.6875,
      "completions/mean_terminated_length": 123.6875,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.34754548966884613,
      "epoch": 0.22792959703566465,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00761881098151207,
      "kl": 0.004353253520093858,
      "learning_rate": 9.544233441408058e-07,
      "loss": 0.0002,
      "num_tokens": 135330440.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4921,
      "step_time": 13.971477288752794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 211.0,
      "completions/max_terminated_length": 211.0,
      "completions/mean_length": 172.125,
      "completions/mean_terminated_length": 172.125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.36169762909412384,
      "epoch": 0.22797591477535897,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002824055962264538,
      "kl": 0.002495995140634477,
      "learning_rate": 9.54414080592867e-07,
      "loss": 0.0001,
      "num_tokens": 135351450.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4922,
      "step_time": 19.587568532675505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 207.25,
      "completions/mean_terminated_length": 207.25,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.4654117077589035,
      "epoch": 0.22802223251505327,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11375361680984497,
      "kl": 0.0049705225974321365,
      "learning_rate": 9.544048170449282e-07,
      "loss": 0.0723,
      "num_tokens": 135378478.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4923,
      "step_time": 23.855288103222847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 163.125,
      "completions/mean_terminated_length": 163.125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.34130582958459854,
      "epoch": 0.22806855025474757,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010463550686836243,
      "kl": 0.007971058366820216,
      "learning_rate": 9.543955534969894e-07,
      "loss": 0.0004,
      "num_tokens": 135399584.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4924,
      "step_time": 19.53496042266488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 141.0625,
      "completions/mean_terminated_length": 141.0625,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.15614399872720242,
      "epoch": 0.22811486799444186,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002858600812032819,
      "kl": 0.003169603645801544,
      "learning_rate": 9.543862899490505e-07,
      "loss": 0.0002,
      "num_tokens": 135420673.0,
      "reward": 0.9200444221496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9200444221496582,
      "rewards/reward_func/std": 0.0,
      "step": 4925,
      "step_time": 15.308533191680908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 392.0,
      "completions/max_terminated_length": 392.0,
      "completions/mean_length": 346.0625,
      "completions/mean_terminated_length": 346.0625,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "entropy": 0.13987799733877182,
      "epoch": 0.22816118573413618,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07261361181735992,
      "kl": 0.024310471722856164,
      "learning_rate": 9.543770264011116e-07,
      "loss": 0.0669,
      "num_tokens": 135462450.0,
      "reward": 0.673902153968811,
      "reward_std": 0.044730495661497116,
      "rewards/reward_func/mean": 0.673902153968811,
      "rewards/reward_func/std": 0.04473051056265831,
      "step": 4926,
      "step_time": 35.43450753763318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 164.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 142.125,
      "completions/mean_terminated_length": 142.125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "entropy": 0.3148866668343544,
      "epoch": 0.22820750347383048,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025384712498635054,
      "kl": 0.001969317498151213,
      "learning_rate": 9.543677628531727e-07,
      "loss": 0.0001,
      "num_tokens": 135493700.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4927,
      "step_time": 17.43789406493306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 167.25,
      "completions/mean_terminated_length": 167.25,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.17270001769065857,
      "epoch": 0.22825382121352478,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038994576316326857,
      "kl": 0.0033217769814655185,
      "learning_rate": 9.543584993052339e-07,
      "loss": 0.0002,
      "num_tokens": 135529992.0,
      "reward": 0.910879909992218,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.910879909992218,
      "rewards/reward_func/std": 0.0,
      "step": 4928,
      "step_time": 21.56278222426772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.0,
      "completions/max_terminated_length": 146.0,
      "completions/mean_length": 131.3125,
      "completions/mean_terminated_length": 131.3125,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.23226439207792282,
      "epoch": 0.22830013895321907,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023610901553183794,
      "kl": 0.0018795774085447192,
      "learning_rate": 9.54349235757295e-07,
      "loss": 0.0001,
      "num_tokens": 135549549.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4929,
      "step_time": 13.779592674225569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 169.0,
      "completions/max_terminated_length": 169.0,
      "completions/mean_length": 147.5,
      "completions/mean_terminated_length": 147.5,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "entropy": 0.3198471963405609,
      "epoch": 0.2283464566929134,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0041888924315571785,
      "kl": 0.0027100183069705963,
      "learning_rate": 9.543399722093561e-07,
      "loss": 0.0001,
      "num_tokens": 135572741.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4930,
      "step_time": 16.231150288134813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 202.0,
      "completions/mean_terminated_length": 202.0,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "entropy": 0.43099071830511093,
      "epoch": 0.2283927744326077,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11954791843891144,
      "kl": 0.009016307070851326,
      "learning_rate": 9.543307086614172e-07,
      "loss": -0.0155,
      "num_tokens": 135596293.0,
      "reward": 0.8848613500595093,
      "reward_std": 0.06865546852350235,
      "rewards/reward_func/mean": 0.8848613500595093,
      "rewards/reward_func/std": 0.06865545362234116,
      "step": 4931,
      "step_time": 23.168150942772627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 209.0625,
      "completions/mean_terminated_length": 209.0625,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.1873588114976883,
      "epoch": 0.228439092172302,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010554197244346142,
      "kl": 0.04245026782155037,
      "learning_rate": 9.543214451134784e-07,
      "loss": 0.0021,
      "num_tokens": 135628486.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4932,
      "step_time": 22.609506770968437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 228.0,
      "completions/max_terminated_length": 228.0,
      "completions/mean_length": 180.5625,
      "completions/mean_terminated_length": 180.5625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.40849726647138596,
      "epoch": 0.22848540991199628,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004541236907243729,
      "kl": 0.003340022871270776,
      "learning_rate": 9.543121815655397e-07,
      "loss": 0.0002,
      "num_tokens": 135686847.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4933,
      "step_time": 29.117306102067232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 144.4375,
      "completions/mean_terminated_length": 144.4375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.2771778255701065,
      "epoch": 0.2285317276516906,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009852061048150063,
      "kl": 0.007017065770924091,
      "learning_rate": 9.543029180176006e-07,
      "loss": 0.0004,
      "num_tokens": 135706598.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4934,
      "step_time": 14.810244608670473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 159.6875,
      "completions/mean_terminated_length": 159.6875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "entropy": 0.28022734820842743,
      "epoch": 0.2285780453913849,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0047379145398736,
      "kl": 0.0032934931805357337,
      "learning_rate": 9.542936544696617e-07,
      "loss": 0.0002,
      "num_tokens": 135730769.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4935,
      "step_time": 17.938908737152815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 135.375,
      "completions/mean_terminated_length": 135.375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 0.3413321375846863,
      "epoch": 0.2286243631310792,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003127356292679906,
      "kl": 0.002794792235363275,
      "learning_rate": 9.542843909217229e-07,
      "loss": 0.0001,
      "num_tokens": 135751655.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4936,
      "step_time": 14.597114082425833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 131.5,
      "completions/mean_terminated_length": 131.5,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "entropy": 0.3287753537297249,
      "epoch": 0.2286706808707735,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030219038017094135,
      "kl": 0.0021067450288683176,
      "learning_rate": 9.542751273737842e-07,
      "loss": 0.0001,
      "num_tokens": 135785455.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4937,
      "step_time": 17.11738248169422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 195.0,
      "completions/max_terminated_length": 195.0,
      "completions/mean_length": 174.125,
      "completions/mean_terminated_length": 174.125,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "entropy": 0.1680516116321087,
      "epoch": 0.22871699861046782,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002871938282623887,
      "kl": 0.0023986430023796856,
      "learning_rate": 9.542658638258453e-07,
      "loss": 0.0001,
      "num_tokens": 135823729.0,
      "reward": 0.8702397346496582,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8702397346496582,
      "rewards/reward_func/std": 0.0,
      "step": 4938,
      "step_time": 21.362465284764767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 222.75,
      "completions/mean_terminated_length": 222.75,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "entropy": 0.3788619041442871,
      "epoch": 0.22876331635016212,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10780991613864899,
      "kl": 0.01446099323220551,
      "learning_rate": 9.542566002779065e-07,
      "loss": -0.0269,
      "num_tokens": 135855373.0,
      "reward": 0.034384146332740784,
      "reward_std": 0.035511795431375504,
      "rewards/reward_func/mean": 0.034384146332740784,
      "rewards/reward_func/std": 0.035511795431375504,
      "step": 4939,
      "step_time": 25.42097443714738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 166.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 142.875,
      "completions/mean_terminated_length": 142.875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "entropy": 0.2784198522567749,
      "epoch": 0.2288096340898564,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00905576627701521,
      "kl": 0.007316823350265622,
      "learning_rate": 9.542473367299676e-07,
      "loss": 0.0004,
      "num_tokens": 135877403.0,
      "reward": 0.49658530950546265,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.49658530950546265,
      "rewards/reward_func/std": 0.0,
      "step": 4940,
      "step_time": 15.711755707859993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 246.0,
      "completions/max_terminated_length": 246.0,
      "completions/mean_length": 185.5625,
      "completions/mean_terminated_length": 185.5625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.19240785762667656,
      "epoch": 0.2288559518295507,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10340874642133713,
      "kl": 0.010724761057645082,
      "learning_rate": 9.542380731820287e-07,
      "loss": -0.0762,
      "num_tokens": 135900740.0,
      "reward": 0.5074652433395386,
      "reward_std": 0.19226588308811188,
      "rewards/reward_func/mean": 0.5074652433395386,
      "rewards/reward_func/std": 0.19226589798927307,
      "step": 4941,
      "step_time": 21.3562855347991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 128.0625,
      "completions/mean_terminated_length": 128.0625,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.21739665046334267,
      "epoch": 0.22890226956924503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00455134455114603,
      "kl": 0.002902865700889379,
      "learning_rate": 9.542288096340898e-07,
      "loss": 0.0001,
      "num_tokens": 135920325.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4942,
      "step_time": 13.519710768014193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 128.0,
      "completions/max_terminated_length": 128.0,
      "completions/mean_length": 117.0,
      "completions/mean_terminated_length": 117.0,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.2576649561524391,
      "epoch": 0.22894858730893933,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026868912391364574,
      "kl": 0.0017338355246465653,
      "learning_rate": 9.54219546086151e-07,
      "loss": 0.0001,
      "num_tokens": 135940117.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4943,
      "step_time": 12.438980303704739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 266.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 213.4375,
      "completions/mean_terminated_length": 213.4375,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.22489995509386063,
      "epoch": 0.22899490504863362,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003607120830565691,
      "kl": 0.003690137469675392,
      "learning_rate": 9.54210282538212e-07,
      "loss": 0.0002,
      "num_tokens": 135968828.0,
      "reward": 0.3035200238227844,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.3035200238227844,
      "rewards/reward_func/std": 0.0,
      "step": 4944,
      "step_time": 23.624397356063128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 219.125,
      "completions/mean_terminated_length": 219.125,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "entropy": 0.30274175852537155,
      "epoch": 0.22904122278832792,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09864536672830582,
      "kl": 0.035230671521276236,
      "learning_rate": 9.542010189902732e-07,
      "loss": -0.1351,
      "num_tokens": 135992430.0,
      "reward": 0.37219002842903137,
      "reward_std": 0.43586432933807373,
      "rewards/reward_func/mean": 0.37219002842903137,
      "rewards/reward_func/std": 0.43586432933807373,
      "step": 4945,
      "step_time": 24.21020607277751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 207.0,
      "completions/max_terminated_length": 207.0,
      "completions/mean_length": 170.5625,
      "completions/mean_terminated_length": 170.5625,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "entropy": 0.28714311867952347,
      "epoch": 0.22908754052802224,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1121155172586441,
      "kl": 0.0127812679274939,
      "learning_rate": 9.541917554423343e-07,
      "loss": -0.0098,
      "num_tokens": 136019207.0,
      "reward": 0.8189885020256042,
      "reward_std": 0.32016006112098694,
      "rewards/reward_func/mean": 0.8189885020256042,
      "rewards/reward_func/std": 0.3201601207256317,
      "step": 4946,
      "step_time": 19.498099893331528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 185.1875,
      "completions/mean_terminated_length": 185.1875,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "entropy": 0.19009604305028915,
      "epoch": 0.22913385826771654,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12893837690353394,
      "kl": 0.008450278546661139,
      "learning_rate": 9.541824918943955e-07,
      "loss": -0.0237,
      "num_tokens": 136060538.0,
      "reward": 0.9297850131988525,
      "reward_std": 0.06395463645458221,
      "rewards/reward_func/mean": 0.9297850131988525,
      "rewards/reward_func/std": 0.06395463645458221,
      "step": 4947,
      "step_time": 22.743079613894224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.0,
      "completions/max_terminated_length": 201.0,
      "completions/mean_length": 165.1875,
      "completions/mean_terminated_length": 165.1875,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.19660202413797379,
      "epoch": 0.22918017600741084,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003081459319218993,
      "kl": 0.002807235694490373,
      "learning_rate": 9.541732283464566e-07,
      "loss": 0.0001,
      "num_tokens": 136097453.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4948,
      "step_time": 21.365657705813646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 203.0,
      "completions/max_terminated_length": 203.0,
      "completions/mean_length": 171.875,
      "completions/mean_terminated_length": 171.875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "entropy": 0.3890600800514221,
      "epoch": 0.22922649374710513,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002776462584733963,
      "kl": 0.003068011370487511,
      "learning_rate": 9.541639647985177e-07,
      "loss": 0.0002,
      "num_tokens": 136128763.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4949,
      "step_time": 20.01358639076352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 321.0,
      "completions/max_terminated_length": 321.0,
      "completions/mean_length": 261.375,
      "completions/mean_terminated_length": 261.375,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "entropy": 0.4045885503292084,
      "epoch": 0.22927281148679945,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09543655812740326,
      "kl": 0.027542250230908394,
      "learning_rate": 9.54154701250579e-07,
      "loss": -0.0888,
      "num_tokens": 136160545.0,
      "reward": 0.597649872303009,
      "reward_std": 0.4781515896320343,
      "rewards/reward_func/mean": 0.597649872303009,
      "rewards/reward_func/std": 0.4781516194343567,
      "step": 4950,
      "step_time": 29.6665663048625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 183.0625,
      "completions/mean_terminated_length": 183.0625,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "entropy": 0.22729328647255898,
      "epoch": 0.22931912922649375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12908326089382172,
      "kl": 0.04984285309910774,
      "learning_rate": 9.541454377026402e-07,
      "loss": -0.0778,
      "num_tokens": 136197266.0,
      "reward": 0.4950423836708069,
      "reward_std": 0.35888388752937317,
      "rewards/reward_func/mean": 0.4950423836708069,
      "rewards/reward_func/std": 0.35888388752937317,
      "step": 4951,
      "step_time": 23.91749533265829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 188.0,
      "completions/max_terminated_length": 188.0,
      "completions/mean_length": 164.625,
      "completions/mean_terminated_length": 164.625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.17654425278306007,
      "epoch": 0.22936544696618805,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0059771547093987465,
      "kl": 0.03696775436401367,
      "learning_rate": 9.54136174154701e-07,
      "loss": 0.0018,
      "num_tokens": 136219148.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4952,
      "step_time": 17.012957394123077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 178.625,
      "completions/mean_terminated_length": 178.625,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "entropy": 0.37332598865032196,
      "epoch": 0.22941176470588234,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1353083699941635,
      "kl": 0.03166711889207363,
      "learning_rate": 9.541269106067624e-07,
      "loss": -0.061,
      "num_tokens": 136271302.0,
      "reward": 0.13612115383148193,
      "reward_std": 0.2926517128944397,
      "rewards/reward_func/mean": 0.13612115383148193,
      "rewards/reward_func/std": 0.2926517128944397,
      "step": 4953,
      "step_time": 29.59253205731511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.0,
      "completions/max_terminated_length": 265.0,
      "completions/mean_length": 232.3125,
      "completions/mean_terminated_length": 232.3125,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "entropy": 0.27667227387428284,
      "epoch": 0.22945808244557667,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0070470962673425674,
      "kl": 0.008905272232368588,
      "learning_rate": 9.541176470588235e-07,
      "loss": 0.0004,
      "num_tokens": 136307147.0,
      "reward": 0.7378081679344177,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.7378081679344177,
      "rewards/reward_func/std": 0.0,
      "step": 4954,
      "step_time": 26.62858249619603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 158.3125,
      "completions/mean_terminated_length": 158.3125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "entropy": 0.3905433714389801,
      "epoch": 0.22950440018527096,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013934897258877754,
      "kl": 0.014455066993832588,
      "learning_rate": 9.541083835108847e-07,
      "loss": 0.0007,
      "num_tokens": 136329488.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4955,
      "step_time": 20.871957015246153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 238.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 197.1875,
      "completions/mean_terminated_length": 197.1875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.40288519859313965,
      "epoch": 0.22955071792496526,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004134077113121748,
      "kl": 0.003971407248172909,
      "learning_rate": 9.540991199629458e-07,
      "loss": 0.0002,
      "num_tokens": 136355107.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4956,
      "step_time": 21.19922338426113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 218.0,
      "completions/max_terminated_length": 218.0,
      "completions/mean_length": 198.0,
      "completions/mean_terminated_length": 198.0,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "entropy": 0.1957165263593197,
      "epoch": 0.22959703566465955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003859918564558029,
      "kl": 0.003108838980551809,
      "learning_rate": 9.54089856415007e-07,
      "loss": 0.0002,
      "num_tokens": 136382419.0,
      "reward": 0.6563555598258972,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.6563555598258972,
      "rewards/reward_func/std": 0.0,
      "step": 4957,
      "step_time": 20.1811338737607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 140.375,
      "completions/mean_terminated_length": 140.375,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.1737431362271309,
      "epoch": 0.22964335340435388,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034294351935386658,
      "kl": 0.0024284476530738175,
      "learning_rate": 9.54080592867068e-07,
      "loss": 0.0001,
      "num_tokens": 136403161.0,
      "reward": 0.2865048050880432,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.2865048050880432,
      "rewards/reward_func/std": 0.0,
      "step": 4958,
      "step_time": 16.0653938613832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.0,
      "completions/max_terminated_length": 158.0,
      "completions/mean_length": 137.375,
      "completions/mean_terminated_length": 137.375,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.2510499581694603,
      "epoch": 0.22968967114404817,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020902531687170267,
      "kl": 0.0016211385518545285,
      "learning_rate": 9.540713293191292e-07,
      "loss": 0.0001,
      "num_tokens": 136423807.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4959,
      "step_time": 14.679680604487658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 187.0,
      "completions/max_terminated_length": 187.0,
      "completions/mean_length": 159.875,
      "completions/mean_terminated_length": 159.875,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.3182985484600067,
      "epoch": 0.22973598888374247,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015090767294168472,
      "kl": 0.006587028503417969,
      "learning_rate": 9.540620657711903e-07,
      "loss": 0.0003,
      "num_tokens": 136449533.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4960,
      "step_time": 17.82992237433791
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.0,
      "completions/max_terminated_length": 174.0,
      "completions/mean_length": 128.1875,
      "completions/mean_terminated_length": 128.1875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3285192921757698,
      "epoch": 0.22978230662343677,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006869843229651451,
      "kl": 0.005431405734270811,
      "learning_rate": 9.540528022232514e-07,
      "loss": 0.0003,
      "num_tokens": 136485856.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4961,
      "step_time": 20.039498902857304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 126.6875,
      "completions/mean_terminated_length": 126.6875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.326532818377018,
      "epoch": 0.2298286243631311,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002713272115215659,
      "kl": 0.0021965482737869024,
      "learning_rate": 9.540435386753125e-07,
      "loss": 0.0001,
      "num_tokens": 136510251.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4962,
      "step_time": 15.315373342484236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.0,
      "completions/max_terminated_length": 180.0,
      "completions/mean_length": 146.125,
      "completions/mean_terminated_length": 146.125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "entropy": 0.33709999918937683,
      "epoch": 0.22987494210282539,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003737780964002013,
      "kl": 0.0025583981187082827,
      "learning_rate": 9.540342751273739e-07,
      "loss": 0.0001,
      "num_tokens": 136533021.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4963,
      "step_time": 16.635175190865993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 303.0,
      "completions/max_terminated_length": 303.0,
      "completions/mean_length": 245.4375,
      "completions/mean_terminated_length": 245.4375,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "entropy": 0.19287850707769394,
      "epoch": 0.22992125984251968,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09211494028568268,
      "kl": 0.009767316281795502,
      "learning_rate": 9.54025011579435e-07,
      "loss": -0.0335,
      "num_tokens": 136557172.0,
      "reward": 0.302733838558197,
      "reward_std": 0.08072902262210846,
      "rewards/reward_func/mean": 0.302733838558197,
      "rewards/reward_func/std": 0.08072902262210846,
      "step": 4964,
      "step_time": 25.37386042997241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 152.0,
      "completions/max_terminated_length": 152.0,
      "completions/mean_length": 133.5,
      "completions/mean_terminated_length": 133.5,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "entropy": 0.3527248054742813,
      "epoch": 0.22996757758221398,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003740234998986125,
      "kl": 0.0025834679254330695,
      "learning_rate": 9.54015748031496e-07,
      "loss": 0.0001,
      "num_tokens": 136577900.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4965,
      "step_time": 14.393873788416386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 210.0,
      "completions/max_terminated_length": 210.0,
      "completions/mean_length": 178.3125,
      "completions/mean_terminated_length": 178.3125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "entropy": 0.4117540717124939,
      "epoch": 0.2300138953219083,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008569171652197838,
      "kl": 0.008226197911426425,
      "learning_rate": 9.54006484483557e-07,
      "loss": 0.0004,
      "num_tokens": 136602977.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4966,
      "step_time": 18.987396009266376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 167.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 141.9375,
      "completions/mean_terminated_length": 141.9375,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.32656220346689224,
      "epoch": 0.2300602130616026,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007840272039175034,
      "kl": 0.0038428864208981395,
      "learning_rate": 9.539972209356184e-07,
      "loss": 0.0002,
      "num_tokens": 136626272.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4967,
      "step_time": 15.917999155819416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 178.0,
      "completions/max_terminated_length": 178.0,
      "completions/mean_length": 147.25,
      "completions/mean_terminated_length": 147.25,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.29315244406461716,
      "epoch": 0.2301065308012969,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031389619689434767,
      "kl": 0.002125636616256088,
      "learning_rate": 9.539879573876795e-07,
      "loss": 0.0001,
      "num_tokens": 136649220.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4968,
      "step_time": 17.14676835387945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 206.0,
      "completions/max_terminated_length": 206.0,
      "completions/mean_length": 159.625,
      "completions/mean_terminated_length": 159.625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "entropy": 0.21956472471356392,
      "epoch": 0.2301528485409912,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.33373013138771057,
      "kl": 0.030344389146193862,
      "learning_rate": 9.539786938397406e-07,
      "loss": -0.0922,
      "num_tokens": 136682702.0,
      "reward": 0.5341726541519165,
      "reward_std": 0.24100929498672485,
      "rewards/reward_func/mean": 0.5341726541519165,
      "rewards/reward_func/std": 0.24100928008556366,
      "step": 4969,
      "step_time": 20.75757497921586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 304.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 232.8125,
      "completions/mean_terminated_length": 232.8125,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "entropy": 0.47347675263881683,
      "epoch": 0.2301991662806855,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10812296718358994,
      "kl": 0.005712226149626076,
      "learning_rate": 9.539694302918017e-07,
      "loss": 0.0767,
      "num_tokens": 136710075.0,
      "reward": 0.9375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.9375,
      "rewards/reward_func/std": 0.25,
      "step": 4970,
      "step_time": 25.8978082947433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 149.0,
      "completions/max_terminated_length": 149.0,
      "completions/mean_length": 133.0,
      "completions/mean_terminated_length": 133.0,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.13757196068763733,
      "epoch": 0.2302454840203798,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2606033682823181,
      "kl": 0.05828773230314255,
      "learning_rate": 9.539601667438629e-07,
      "loss": 0.0083,
      "num_tokens": 136730731.0,
      "reward": 0.9269284009933472,
      "reward_std": 0.023441297933459282,
      "rewards/reward_func/mean": 0.9269284009933472,
      "rewards/reward_func/std": 0.023441297933459282,
      "step": 4971,
      "step_time": 14.104417875409126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 184.9375,
      "completions/mean_terminated_length": 184.9375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.4354928955435753,
      "epoch": 0.2302918017600741,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007435917388647795,
      "kl": 0.0059344901237636805,
      "learning_rate": 9.53950903195924e-07,
      "loss": 0.0003,
      "num_tokens": 136755914.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4972,
      "step_time": 20.306267257779837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 173.875,
      "completions/mean_terminated_length": 173.875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.35089272260665894,
      "epoch": 0.2303381194997684,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01030554249882698,
      "kl": 0.011292512994259596,
      "learning_rate": 9.539416396479851e-07,
      "loss": 0.0006,
      "num_tokens": 136778376.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4973,
      "step_time": 18.960453048348427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 205.0,
      "completions/max_terminated_length": 205.0,
      "completions/mean_length": 173.9375,
      "completions/mean_terminated_length": 173.9375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "entropy": 0.2830249294638634,
      "epoch": 0.23038443723946272,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13870753347873688,
      "kl": 0.012759183533489704,
      "learning_rate": 9.539323761000462e-07,
      "loss": 0.0427,
      "num_tokens": 136799639.0,
      "reward": 0.8004477024078369,
      "reward_std": 0.3188842236995697,
      "rewards/reward_func/mean": 0.8004477024078369,
      "rewards/reward_func/std": 0.3188842535018921,
      "step": 4974,
      "step_time": 19.32561694830656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.0,
      "completions/max_terminated_length": 173.0,
      "completions/mean_length": 123.5625,
      "completions/mean_terminated_length": 123.5625,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "entropy": 0.3059925436973572,
      "epoch": 0.23043075497915702,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004238604102283716,
      "kl": 0.00254950430826284,
      "learning_rate": 9.539231125521074e-07,
      "loss": 0.0001,
      "num_tokens": 136821072.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4975,
      "step_time": 15.8724498860538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 204.9375,
      "completions/mean_terminated_length": 204.9375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "entropy": 0.3470360338687897,
      "epoch": 0.23047707271885132,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11621661484241486,
      "kl": 0.005828688619658351,
      "learning_rate": 9.539138490041687e-07,
      "loss": -0.0766,
      "num_tokens": 136846847.0,
      "reward": 0.5920567512512207,
      "reward_std": 0.4122554361820221,
      "rewards/reward_func/mean": 0.5920567512512207,
      "rewards/reward_func/std": 0.4122554361820221,
      "step": 4976,
      "step_time": 23.909298427402973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 269.0625,
      "completions/mean_terminated_length": 269.0625,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "entropy": 0.310733363032341,
      "epoch": 0.2305233904585456,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14859719574451447,
      "kl": 0.01813937397673726,
      "learning_rate": 9.539045854562296e-07,
      "loss": -0.0418,
      "num_tokens": 136885728.0,
      "reward": 0.6814736723899841,
      "reward_std": 0.271925151348114,
      "rewards/reward_func/mean": 0.6814736723899841,
      "rewards/reward_func/std": 0.2719251811504364,
      "step": 4977,
      "step_time": 30.854093376547098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.0,
      "completions/max_terminated_length": 144.0,
      "completions/mean_length": 128.0,
      "completions/mean_terminated_length": 128.0,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3014540448784828,
      "epoch": 0.23056970819823994,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004114687442779541,
      "kl": 0.0025400363956578076,
      "learning_rate": 9.538953219082907e-07,
      "loss": 0.0001,
      "num_tokens": 136906944.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4978,
      "step_time": 14.466225132346153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 157.0,
      "completions/mean_terminated_length": 157.0,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 0.31905554980039597,
      "epoch": 0.23061602593793423,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009289965033531189,
      "kl": 0.005353609682060778,
      "learning_rate": 9.538860583603519e-07,
      "loss": 0.0003,
      "num_tokens": 136943536.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4979,
      "step_time": 20.306705847382545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 221.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 191.125,
      "completions/mean_terminated_length": 191.125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "entropy": 0.40436309576034546,
      "epoch": 0.23066234367762853,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009574116207659245,
      "kl": 0.008132883347570896,
      "learning_rate": 9.538767948124132e-07,
      "loss": 0.0004,
      "num_tokens": 136969442.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4980,
      "step_time": 20.005169097334146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 154.0,
      "completions/max_terminated_length": 154.0,
      "completions/mean_length": 131.25,
      "completions/mean_terminated_length": 131.25,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "entropy": 0.27942420542240143,
      "epoch": 0.23070866141732282,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002362277591601014,
      "kl": 0.0019525960087776184,
      "learning_rate": 9.538675312644743e-07,
      "loss": 0.0001,
      "num_tokens": 136990118.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4981,
      "step_time": 14.53041435033083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.0,
      "completions/max_terminated_length": 156.0,
      "completions/mean_length": 134.9375,
      "completions/mean_terminated_length": 134.9375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 0.2817884013056755,
      "epoch": 0.23075497915701715,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003836382180452347,
      "kl": 0.0029382259235717356,
      "learning_rate": 9.538582677165355e-07,
      "loss": 0.0001,
      "num_tokens": 137011285.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4982,
      "step_time": 14.541017945855856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 155.0625,
      "completions/mean_terminated_length": 155.0625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.1965767741203308,
      "epoch": 0.23080129689671144,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005989167373627424,
      "kl": 0.0038433902082033455,
      "learning_rate": 9.538490041685966e-07,
      "loss": 0.0002,
      "num_tokens": 137034038.0,
      "reward": 0.9259610772132874,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9259610772132874,
      "rewards/reward_func/std": 0.0,
      "step": 4983,
      "step_time": 17.173334512859583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 175.0,
      "completions/max_terminated_length": 175.0,
      "completions/mean_length": 151.25,
      "completions/mean_terminated_length": 151.25,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "entropy": 0.13582609966397285,
      "epoch": 0.23084761463640574,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019398063886910677,
      "kl": 0.0012038782879244536,
      "learning_rate": 9.538397406206577e-07,
      "loss": 0.0001,
      "num_tokens": 137078842.0,
      "reward": 0.8187307715415955,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.8187307715415955,
      "rewards/reward_func/std": 0.0,
      "step": 4984,
      "step_time": 21.77512374892831
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 130.0,
      "completions/max_terminated_length": 130.0,
      "completions/mean_length": 118.4375,
      "completions/mean_terminated_length": 118.4375,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "entropy": 0.27934522181749344,
      "epoch": 0.23089393237610004,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035192626528441906,
      "kl": 0.0022731263597961515,
      "learning_rate": 9.538304770727188e-07,
      "loss": 0.0001,
      "num_tokens": 137099729.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4985,
      "step_time": 13.443418379873037
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 338.0,
      "completions/max_terminated_length": 338.0,
      "completions/mean_length": 307.5,
      "completions/mean_terminated_length": 307.5,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "entropy": 0.18417320027947426,
      "epoch": 0.23094025011579436,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11577610671520233,
      "kl": 0.009522537817247212,
      "learning_rate": 9.5382121352478e-07,
      "loss": 0.0377,
      "num_tokens": 137129561.0,
      "reward": 0.9662111401557922,
      "reward_std": 0.027031106874346733,
      "rewards/reward_func/mean": 0.9662111401557922,
      "rewards/reward_func/std": 0.02703109383583069,
      "step": 4986,
      "step_time": 29.63142754882574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 171.0,
      "completions/max_terminated_length": 171.0,
      "completions/mean_length": 153.125,
      "completions/mean_terminated_length": 153.125,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "entropy": 0.1748918518424034,
      "epoch": 0.23098656785548866,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09933748096227646,
      "kl": 0.002387493441347033,
      "learning_rate": 9.53811949976841e-07,
      "loss": -0.0282,
      "num_tokens": 137158747.0,
      "reward": 0.9167327880859375,
      "reward_std": 0.03250420093536377,
      "rewards/reward_func/mean": 0.9167327880859375,
      "rewards/reward_func/std": 0.03250420466065407,
      "step": 4987,
      "step_time": 18.521799258887768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 231.875,
      "completions/mean_terminated_length": 231.875,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "entropy": 0.32956328243017197,
      "epoch": 0.23103288559518295,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10205793380737305,
      "kl": 0.014923338778316975,
      "learning_rate": 9.538026864289022e-07,
      "loss": -0.0224,
      "num_tokens": 137197113.0,
      "reward": 0.8125,
      "reward_std": 0.40311288833618164,
      "rewards/reward_func/mean": 0.8125,
      "rewards/reward_func/std": 0.40311288833618164,
      "step": 4988,
      "step_time": 25.59490615129471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 160.0,
      "completions/mean_terminated_length": 160.0,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.4404328987002373,
      "epoch": 0.23107920333487725,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031527713872492313,
      "kl": 0.0027730937872547656,
      "learning_rate": 9.537934228809633e-07,
      "loss": 0.0001,
      "num_tokens": 137241481.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4989,
      "step_time": 23.048565436154604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.0,
      "completions/max_terminated_length": 217.0,
      "completions/mean_length": 177.0625,
      "completions/mean_terminated_length": 177.0625,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "entropy": 0.20850744098424911,
      "epoch": 0.23112552107457157,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0055336784571409225,
      "kl": 0.006053523859009147,
      "learning_rate": 9.537841593330245e-07,
      "loss": 0.0003,
      "num_tokens": 137273658.0,
      "reward": 0.9181891679763794,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9181891679763794,
      "rewards/reward_func/std": 0.0,
      "step": 4990,
      "step_time": 21.69393128529191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 132.5625,
      "completions/mean_terminated_length": 132.5625,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.23681363463401794,
      "epoch": 0.23117183881426587,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003796979319304228,
      "kl": 0.002893943339586258,
      "learning_rate": 9.537748957850856e-07,
      "loss": 0.0001,
      "num_tokens": 137293923.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4991,
      "step_time": 14.124348815530539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 182.5,
      "completions/mean_terminated_length": 182.5,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "entropy": 0.3799362927675247,
      "epoch": 0.23121815655396016,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01965484395623207,
      "kl": 0.019588962895795703,
      "learning_rate": 9.537656322371467e-07,
      "loss": 0.001,
      "num_tokens": 137320299.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 4992,
      "step_time": 24.335781812667847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 193.0,
      "completions/mean_length": 161.4375,
      "completions/mean_terminated_length": 161.4375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.44669947773218155,
      "epoch": 0.23126447429365446,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035574499052017927,
      "kl": 0.002802410745061934,
      "learning_rate": 9.53756368689208e-07,
      "loss": 0.0001,
      "num_tokens": 137350626.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4993,
      "step_time": 19.18287806212902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 156.9375,
      "completions/mean_terminated_length": 156.9375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "entropy": 0.309749573469162,
      "epoch": 0.23131079203334878,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002668954897671938,
      "kl": 0.002336340432520956,
      "learning_rate": 9.537471051412692e-07,
      "loss": 0.0001,
      "num_tokens": 137374081.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4994,
      "step_time": 16.534566815942526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 170.0625,
      "completions/mean_terminated_length": 170.0625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "entropy": 0.2118827849626541,
      "epoch": 0.23135710977304308,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11174722015857697,
      "kl": 0.005126630654558539,
      "learning_rate": 9.5373784159333e-07,
      "loss": 0.0033,
      "num_tokens": 137396066.0,
      "reward": 0.5191009044647217,
      "reward_std": 0.013752984814345837,
      "rewards/reward_func/mean": 0.5191009044647217,
      "rewards/reward_func/std": 0.013752982951700687,
      "step": 4995,
      "step_time": 18.149486247450113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 151.0,
      "completions/max_terminated_length": 151.0,
      "completions/mean_length": 130.0,
      "completions/mean_terminated_length": 130.0,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "entropy": 0.2366933375597,
      "epoch": 0.23140342751273738,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006469685584306717,
      "kl": 0.0036037511890754104,
      "learning_rate": 9.537285780453912e-07,
      "loss": 0.0002,
      "num_tokens": 137417058.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4996,
      "step_time": 14.295692507177591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 369.0,
      "completions/max_terminated_length": 369.0,
      "completions/mean_length": 263.125,
      "completions/mean_terminated_length": 263.125,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "entropy": 0.3441452980041504,
      "epoch": 0.23144974525243167,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12067591398954391,
      "kl": 0.020268420223146677,
      "learning_rate": 9.537193144974525e-07,
      "loss": -0.0449,
      "num_tokens": 137448916.0,
      "reward": 0.014472972601652145,
      "reward_std": 0.05553954839706421,
      "rewards/reward_func/mean": 0.014472972601652145,
      "rewards/reward_func/std": 0.05553954839706421,
      "step": 4997,
      "step_time": 31.663094013929367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 163.25,
      "completions/mean_terminated_length": 163.25,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "entropy": 0.4020567238330841,
      "epoch": 0.231496062992126,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009164032526314259,
      "kl": 0.004412938025780022,
      "learning_rate": 9.537100509495137e-07,
      "loss": 0.0002,
      "num_tokens": 137474872.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4998,
      "step_time": 18.051149625331163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.0,
      "completions/max_terminated_length": 136.0,
      "completions/mean_length": 120.8125,
      "completions/mean_terminated_length": 120.8125,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "entropy": 0.27976813167333603,
      "epoch": 0.2315423807318203,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006004109513014555,
      "kl": 0.0033063965383917093,
      "learning_rate": 9.537007874015748e-07,
      "loss": 0.0002,
      "num_tokens": 137495285.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 4999,
      "step_time": 13.659004848450422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 197.0,
      "completions/max_terminated_length": 197.0,
      "completions/mean_length": 161.25,
      "completions/mean_terminated_length": 161.25,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "entropy": 0.36634664237499237,
      "epoch": 0.2315886984715146,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011201155371963978,
      "kl": 0.011701725656166673,
      "learning_rate": 9.53691523853636e-07,
      "loss": 0.0006,
      "num_tokens": 137521881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 5000,
      "step_time": 18.465140528976917
    }
  ],
  "logging_steps": 1,
  "max_steps": 107950,
  "num_input_tokens_seen": 137521881,
  "num_train_epochs": 5,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}