{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.85546875,
  "eval_steps": 500,
  "global_step": 950,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5980451107025146,
      "epoch": 0.001953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23979568481445312,
      "learning_rate": 5e-06,
      "loss": 0.0,
      "num_tokens": 4704.0,
      "reward": 0.6000000238418579,
      "reward_std": 0.5153923630714417,
      "rewards/reward_func/mean": 0.6000000238418579,
      "rewards/reward_func/std": 0.49902763962745667,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8363598696887493,
      "epoch": 0.00390625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15716730058193207,
      "learning_rate": 4.9947368421052636e-06,
      "loss": -0.0,
      "num_tokens": 9408.0,
      "reward": 0.625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7976906187832355,
      "epoch": 0.005859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21684496104717255,
      "learning_rate": 4.989473684210527e-06,
      "loss": -0.0,
      "num_tokens": 13960.0,
      "reward": 0.8697500228881836,
      "reward_std": 0.2605000138282776,
      "rewards/reward_func/mean": 0.8697500228881836,
      "rewards/reward_func/std": 0.3517392575740814,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5398289896547794,
      "epoch": 0.0078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16021442413330078,
      "learning_rate": 4.98421052631579e-06,
      "loss": -0.0,
      "num_tokens": 18624.0,
      "reward": 0.375,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4209697637706995,
      "epoch": 0.009765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16803590953350067,
      "learning_rate": 4.978947368421053e-06,
      "loss": 0.0,
      "num_tokens": 23392.0,
      "reward": 0.3655624985694885,
      "reward_std": 0.5249491930007935,
      "rewards/reward_func/mean": 0.3655624985694885,
      "rewards/reward_func/std": 0.5012756586074829,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4214139562100172,
      "epoch": 0.01171875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1561802625656128,
      "learning_rate": 4.973684210526316e-06,
      "loss": 0.0,
      "num_tokens": 28168.0,
      "reward": 0.1210624948143959,
      "reward_std": 0.2379765659570694,
      "rewards/reward_func/mean": 0.1210624948143959,
      "rewards/reward_func/std": 0.33739402890205383,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6921200603246689,
      "epoch": 0.013671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18477526307106018,
      "learning_rate": 4.968421052631579e-06,
      "loss": 0.0,
      "num_tokens": 33032.0,
      "reward": 0.375,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6270075961947441,
      "epoch": 0.015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19665871560573578,
      "learning_rate": 4.9631578947368426e-06,
      "loss": -0.0,
      "num_tokens": 37720.0,
      "reward": 0.375,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8109602108597755,
      "epoch": 0.017578125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1696549654006958,
      "learning_rate": 4.957894736842106e-06,
      "loss": 0.0,
      "num_tokens": 42208.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6766198761761189,
      "epoch": 0.01953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22143109142780304,
      "learning_rate": 4.952631578947369e-06,
      "loss": 0.0,
      "num_tokens": 46688.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7187379151582718,
      "epoch": 0.021484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18868106603622437,
      "learning_rate": 4.947368421052632e-06,
      "loss": 0.0,
      "num_tokens": 51312.0,
      "reward": 0.48899999260902405,
      "reward_std": 0.48899999260902405,
      "rewards/reward_func/mean": 0.48899999260902405,
      "rewards/reward_func/std": 0.5227630138397217,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6034604460000992,
      "epoch": 0.0234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2152688056230545,
      "learning_rate": 4.942105263157895e-06,
      "loss": 0.0,
      "num_tokens": 55928.0,
      "reward": 0.6254249811172485,
      "reward_std": 0.5333645343780518,
      "rewards/reward_func/mean": 0.6254249811172485,
      "rewards/reward_func/std": 0.5111108422279358,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6899241618812084,
      "epoch": 0.025390625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1043475940823555,
      "learning_rate": 4.936842105263158e-06,
      "loss": 0.0,
      "num_tokens": 60568.0,
      "reward": 0.12229999899864197,
      "reward_std": 0.24459999799728394,
      "rewards/reward_func/mean": 0.12229999899864197,
      "rewards/reward_func/std": 0.34591662883758545,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5026568742468953,
      "epoch": 0.02734375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13232570886611938,
      "learning_rate": 4.9315789473684215e-06,
      "loss": 0.0,
      "num_tokens": 65352.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/reward_func/mean": 0.0031250000465661287,
      "rewards/reward_func/std": 0.008838835172355175,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6425582990050316,
      "epoch": 0.029296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23602722585201263,
      "learning_rate": 4.926315789473685e-06,
      "loss": -0.0,
      "num_tokens": 70200.0,
      "reward": 0.6197500228881836,
      "reward_std": 0.5353738069534302,
      "rewards/reward_func/mean": 0.6197500228881836,
      "rewards/reward_func/std": 0.5133981704711914,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8550453297793865,
      "epoch": 0.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21525943279266357,
      "learning_rate": 4.921052631578948e-06,
      "loss": 0.0,
      "num_tokens": 74696.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6007186323404312,
      "epoch": 0.033203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2051023691892624,
      "learning_rate": 4.915789473684211e-06,
      "loss": 0.0,
      "num_tokens": 79168.0,
      "reward": 0.7473000288009644,
      "reward_std": 0.2940751314163208,
      "rewards/reward_func/mean": 0.7473000288009644,
      "rewards/reward_func/std": 0.4613037705421448,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.46435519494116306,
      "epoch": 0.03515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20061683654785156,
      "learning_rate": 4.910526315789474e-06,
      "loss": -0.0,
      "num_tokens": 83520.0,
      "reward": 0.375,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6039901822805405,
      "epoch": 0.037109375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.905263157894737e-06,
      "loss": 0.0,
      "num_tokens": 88480.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.554688960313797,
      "epoch": 0.0390625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.9000000000000005e-06,
      "loss": 0.0,
      "num_tokens": 93464.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6815345846116543,
      "epoch": 0.041015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19471591711044312,
      "learning_rate": 4.894736842105264e-06,
      "loss": 0.0,
      "num_tokens": 98096.0,
      "reward": 0.7252500057220459,
      "reward_std": 0.48400574922561646,
      "rewards/reward_func/mean": 0.7252500057220459,
      "rewards/reward_func/std": 0.4482128620147705,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8857233077287674,
      "epoch": 0.04296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21319618821144104,
      "learning_rate": 4.889473684210527e-06,
      "loss": 0.0,
      "num_tokens": 102792.0,
      "reward": 0.7473000288009644,
      "reward_std": 0.2940751314163208,
      "rewards/reward_func/mean": 0.7473000288009644,
      "rewards/reward_func/std": 0.4613037705421448,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8026586882770061,
      "epoch": 0.044921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21624577045440674,
      "learning_rate": 4.88421052631579e-06,
      "loss": 0.0,
      "num_tokens": 107496.0,
      "reward": 0.375,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.591262873262167,
      "epoch": 0.046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1991681307554245,
      "learning_rate": 4.878947368421053e-06,
      "loss": -0.0,
      "num_tokens": 111832.0,
      "reward": 0.3828125,
      "reward_std": 0.529944896697998,
      "rewards/reward_func/mean": 0.3828125,
      "rewards/reward_func/std": 0.5115163922309875,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5665613692253828,
      "epoch": 0.048828125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14706794917583466,
      "learning_rate": 4.873684210526316e-06,
      "loss": -0.0,
      "num_tokens": 116824.0,
      "reward": 0.0015625000232830644,
      "reward_std": 0.0031250000465661287,
      "rewards/reward_func/mean": 0.0015625000232830644,
      "rewards/reward_func/std": 0.0044194175861775875,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7586575634777546,
      "epoch": 0.05078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23515093326568604,
      "learning_rate": 4.8684210526315795e-06,
      "loss": -0.0,
      "num_tokens": 121680.0,
      "reward": 0.25,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5237887464463711,
      "epoch": 0.052734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.209740549325943,
      "learning_rate": 4.863157894736843e-06,
      "loss": -0.0,
      "num_tokens": 126128.0,
      "reward": 0.375,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6274934262037277,
      "epoch": 0.0546875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.857894736842106e-06,
      "loss": 0.0,
      "num_tokens": 130584.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8661725111305714,
      "epoch": 0.056640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24378839135169983,
      "learning_rate": 4.852631578947369e-06,
      "loss": 0.0,
      "num_tokens": 135048.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6145374327898026,
      "epoch": 0.05859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21723540127277374,
      "learning_rate": 4.847368421052631e-06,
      "loss": -0.0,
      "num_tokens": 139880.0,
      "reward": 0.3762750029563904,
      "reward_std": 0.519599974155426,
      "rewards/reward_func/mean": 0.3762750029563904,
      "rewards/reward_func/std": 0.49889329075813293,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.37807827070355415,
      "epoch": 0.060546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15673603117465973,
      "learning_rate": 4.842105263157895e-06,
      "loss": 0.0,
      "num_tokens": 144368.0,
      "reward": 0.34860000014305115,
      "reward_std": 0.5007524490356445,
      "rewards/reward_func/mean": 0.34860000014305115,
      "rewards/reward_func/std": 0.4811137020587921,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.44579252414405346,
      "epoch": 0.0625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17633330821990967,
      "learning_rate": 4.8368421052631585e-06,
      "loss": 0.0,
      "num_tokens": 149016.0,
      "reward": 0.4918999969959259,
      "reward_std": 0.5681688785552979,
      "rewards/reward_func/mean": 0.4918999969959259,
      "rewards/reward_func/std": 0.5260374546051025,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.46997769735753536,
      "epoch": 0.064453125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.831578947368422e-06,
      "loss": 0.0,
      "num_tokens": 153800.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5178538374602795,
      "epoch": 0.06640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18413496017456055,
      "learning_rate": 4.826315789473685e-06,
      "loss": 0.0,
      "num_tokens": 158472.0,
      "reward": 0.3675000071525574,
      "reward_std": 0.5236751437187195,
      "rewards/reward_func/mean": 0.3675000071525574,
      "rewards/reward_func/std": 0.5075360536575317,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49724796041846275,
      "epoch": 0.068359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14860549569129944,
      "learning_rate": 4.821052631578948e-06,
      "loss": 0.0,
      "num_tokens": 163656.0,
      "reward": 0.25,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7501945123076439,
      "epoch": 0.0703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22438675165176392,
      "learning_rate": 4.815789473684211e-06,
      "loss": 0.0,
      "num_tokens": 168360.0,
      "reward": 0.7473000288009644,
      "reward_std": 0.2940751314163208,
      "rewards/reward_func/mean": 0.7473000288009644,
      "rewards/reward_func/std": 0.4613037705421448,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8743213079869747,
      "epoch": 0.072265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2222360074520111,
      "learning_rate": 4.8105263157894735e-06,
      "loss": 0.0,
      "num_tokens": 172832.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5445486754179001,
      "epoch": 0.07421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20508722960948944,
      "learning_rate": 4.8052631578947375e-06,
      "loss": -0.0,
      "num_tokens": 177192.0,
      "reward": 0.25468748807907104,
      "reward_std": 0.49703317880630493,
      "rewards/reward_func/mean": 0.25468748807907104,
      "rewards/reward_func/std": 0.4601987898349762,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6154294572770596,
      "epoch": 0.076171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2429991364479065,
      "learning_rate": 4.800000000000001e-06,
      "loss": 0.0,
      "num_tokens": 181784.0,
      "reward": 0.6260000467300415,
      "reward_std": 0.5242137908935547,
      "rewards/reward_func/mean": 0.6259999871253967,
      "rewards/reward_func/std": 0.5048788785934448,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.469124692492187,
      "epoch": 0.078125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09975184500217438,
      "learning_rate": 4.794736842105264e-06,
      "loss": 0.0,
      "num_tokens": 186136.0,
      "reward": 0.375,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7508979439735413,
      "epoch": 0.080078125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.20132674276828766,
      "learning_rate": 4.789473684210527e-06,
      "loss": 0.0,
      "num_tokens": 190624.0,
      "reward": 0.8675000071525574,
      "reward_std": 0.2454078197479248,
      "rewards/reward_func/mean": 0.8675000071525574,
      "rewards/reward_func/std": 0.35115116834640503,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6970130242407322,
      "epoch": 0.08203125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15729251503944397,
      "learning_rate": 4.78421052631579e-06,
      "loss": 0.0,
      "num_tokens": 195104.0,
      "reward": 0.75,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7774740010499954,
      "epoch": 0.083984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2186206728219986,
      "learning_rate": 4.778947368421053e-06,
      "loss": 0.0,
      "num_tokens": 199936.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.43700238317251205,
      "epoch": 0.0859375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.094396211206913,
      "learning_rate": 4.773684210526316e-06,
      "loss": -0.0,
      "num_tokens": 204736.0,
      "reward": 0.11949999630451202,
      "reward_std": 0.23899999260902405,
      "rewards/reward_func/mean": 0.11949999630451202,
      "rewards/reward_func/std": 0.33799704909324646,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.32771568559110165,
      "epoch": 0.087890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1980770379304886,
      "learning_rate": 4.76842105263158e-06,
      "loss": -0.0,
      "num_tokens": 209240.0,
      "reward": 0.3465000092983246,
      "reward_std": 0.49655240774154663,
      "rewards/reward_func/mean": 0.3465000092983246,
      "rewards/reward_func/std": 0.47824355959892273,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5444277804344893,
      "epoch": 0.08984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20421306788921356,
      "learning_rate": 4.763157894736842e-06,
      "loss": -0.0,
      "num_tokens": 214088.0,
      "reward": 0.25511249899864197,
      "reward_std": 0.29742252826690674,
      "rewards/reward_func/mean": 0.25511249899864197,
      "rewards/reward_func/std": 0.4534413814544678,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47741230949759483,
      "epoch": 0.091796875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1492561548948288,
      "learning_rate": 4.757894736842106e-06,
      "loss": 0.0,
      "num_tokens": 218680.0,
      "reward": 0.25,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7896424978971481,
      "epoch": 0.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23126615583896637,
      "learning_rate": 4.752631578947369e-06,
      "loss": 0.0,
      "num_tokens": 223168.0,
      "reward": 0.3675000071525574,
      "reward_std": 0.5302826166152954,
      "rewards/reward_func/mean": 0.3675000071525574,
      "rewards/reward_func/std": 0.5075360536575317,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6476297667250037,
      "epoch": 0.095703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18310336768627167,
      "learning_rate": 4.747368421052632e-06,
      "loss": 0.0,
      "num_tokens": 228344.0,
      "reward": 0.49729999899864197,
      "reward_std": 0.5742666125297546,
      "rewards/reward_func/mean": 0.49729999899864197,
      "rewards/reward_func/std": 0.531683087348938,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5440277047455311,
      "epoch": 0.09765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19677050411701202,
      "learning_rate": 4.7421052631578954e-06,
      "loss": 0.0,
      "num_tokens": 232792.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.43108594603836536,
      "epoch": 0.099609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1726936399936676,
      "learning_rate": 4.736842105263158e-06,
      "loss": 0.0,
      "num_tokens": 237968.0,
      "reward": 0.2593750059604645,
      "reward_std": 0.49439018964767456,
      "rewards/reward_func/mean": 0.2593750059604645,
      "rewards/reward_func/std": 0.45785558223724365,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8471032381057739,
      "epoch": 0.1015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1951897144317627,
      "learning_rate": 4.731578947368422e-06,
      "loss": 0.0,
      "num_tokens": 242688.0,
      "reward": 0.3828125,
      "reward_std": 0.5339096784591675,
      "rewards/reward_func/mean": 0.3828125,
      "rewards/reward_func/std": 0.5115163922309875,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9184307493269444,
      "epoch": 0.103515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27299290895462036,
      "learning_rate": 4.726315789473684e-06,
      "loss": 0.0,
      "num_tokens": 247168.0,
      "reward": 0.3812499940395355,
      "reward_std": 0.5347907543182373,
      "rewards/reward_func/mean": 0.3812499940395355,
      "rewards/reward_func/std": 0.5126523971557617,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6415830589830875,
      "epoch": 0.10546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22410304844379425,
      "learning_rate": 4.721052631578948e-06,
      "loss": 0.0,
      "num_tokens": 251656.0,
      "reward": 0.5,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5345224738121033,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8139438554644585,
      "epoch": 0.107421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22878246009349823,
      "learning_rate": 4.71578947368421e-06,
      "loss": 0.0,
      "num_tokens": 256368.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7665090560913086,
      "epoch": 0.109375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13540033996105194,
      "learning_rate": 4.710526315789474e-06,
      "loss": 0.0,
      "num_tokens": 260800.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7670854441821575,
      "epoch": 0.111328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20830559730529785,
      "learning_rate": 4.705263157894738e-06,
      "loss": 0.0,
      "num_tokens": 265296.0,
      "reward": 0.5015624761581421,
      "reward_std": 0.5755573511123657,
      "rewards/reward_func/mean": 0.5015624761581421,
      "rewards/reward_func/std": 0.5328678488731384,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4469104828312993,
      "epoch": 0.11328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20230792462825775,
      "learning_rate": 4.7e-06,
      "loss": 0.0,
      "num_tokens": 269616.0,
      "reward": 0.25468748807907104,
      "reward_std": 0.49703317880630493,
      "rewards/reward_func/mean": 0.25468748807907104,
      "rewards/reward_func/std": 0.4601987898349762,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.43488175235688686,
      "epoch": 0.115234375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1155906692147255,
      "learning_rate": 4.694736842105264e-06,
      "loss": 0.0,
      "num_tokens": 274400.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/reward_func/mean": 0.0031250000465661287,
      "rewards/reward_func/std": 0.008838835172355175,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.538303017616272,
      "epoch": 0.1171875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1079397052526474,
      "learning_rate": 4.689473684210526e-06,
      "loss": 0.0,
      "num_tokens": 279360.0,
      "reward": 0.11620000004768372,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.11620000004768372,
      "rewards/reward_func/std": 0.32866325974464417,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4871381111443043,
      "epoch": 0.119140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20236212015151978,
      "learning_rate": 4.68421052631579e-06,
      "loss": 0.0,
      "num_tokens": 284272.0,
      "reward": 0.0234375,
      "reward_std": 0.04291563481092453,
      "rewards/reward_func/mean": 0.0234375,
      "rewards/reward_func/std": 0.039774756878614426,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.779204398393631,
      "epoch": 0.12109375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14050957560539246,
      "learning_rate": 4.6789473684210525e-06,
      "loss": 0.0,
      "num_tokens": 288760.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5929105505347252,
      "epoch": 0.123046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22070521116256714,
      "learning_rate": 4.6736842105263166e-06,
      "loss": 0.0,
      "num_tokens": 293440.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5897692143917084,
      "epoch": 0.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1809045523405075,
      "learning_rate": 4.668421052631579e-06,
      "loss": 0.0,
      "num_tokens": 298104.0,
      "reward": 0.6265624761581421,
      "reward_std": 0.5368822813034058,
      "rewards/reward_func/mean": 0.6265624761581421,
      "rewards/reward_func/std": 0.515407145023346,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5395209770649672,
      "epoch": 0.126953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21770712733268738,
      "learning_rate": 4.663157894736842e-06,
      "loss": 0.0,
      "num_tokens": 302952.0,
      "reward": 0.375,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5718973986804485,
      "epoch": 0.12890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19175319373607635,
      "learning_rate": 4.657894736842106e-06,
      "loss": 0.0,
      "num_tokens": 307576.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3761954288929701,
      "epoch": 0.130859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1708212047815323,
      "learning_rate": 4.652631578947368e-06,
      "loss": -0.0,
      "num_tokens": 312424.0,
      "reward": 0.24729999899864197,
      "reward_std": 0.49459999799728394,
      "rewards/reward_func/mean": 0.24729999899864197,
      "rewards/reward_func/std": 0.4579469859600067,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4973205029964447,
      "epoch": 0.1328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17592303454875946,
      "learning_rate": 4.647368421052632e-06,
      "loss": 0.0,
      "num_tokens": 317264.0,
      "reward": 0.24459999799728394,
      "reward_std": 0.48919999599456787,
      "rewards/reward_func/mean": 0.24459999799728394,
      "rewards/reward_func/std": 0.45291122794151306,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8153446540236473,
      "epoch": 0.134765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2120014876127243,
      "learning_rate": 4.642105263157895e-06,
      "loss": 0.0,
      "num_tokens": 321760.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.724060770124197,
      "epoch": 0.13671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22452718019485474,
      "learning_rate": 4.636842105263159e-06,
      "loss": -0.0,
      "num_tokens": 326440.0,
      "reward": 0.25,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5170768238604069,
      "epoch": 0.138671875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12857870757579803,
      "learning_rate": 4.631578947368421e-06,
      "loss": 0.0,
      "num_tokens": 331632.0,
      "reward": 0.25,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.688769031316042,
      "epoch": 0.140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22918672859668732,
      "learning_rate": 4.626315789473684e-06,
      "loss": 0.0,
      "num_tokens": 336328.0,
      "reward": 0.614799976348877,
      "reward_std": 0.5321913957595825,
      "rewards/reward_func/mean": 0.614799976348877,
      "rewards/reward_func/std": 0.5094863176345825,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8502191081643105,
      "epoch": 0.142578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23828759789466858,
      "learning_rate": 4.621052631578948e-06,
      "loss": 0.0,
      "num_tokens": 340912.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8155621662735939,
      "epoch": 0.14453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20899727940559387,
      "learning_rate": 4.6157894736842105e-06,
      "loss": 0.0,
      "num_tokens": 345640.0,
      "reward": 0.5,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5345224738121033,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5054981037974358,
      "epoch": 0.146484375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13521376252174377,
      "learning_rate": 4.6105263157894745e-06,
      "loss": 0.0,
      "num_tokens": 350272.0,
      "reward": 0.75,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7490544784814119,
      "epoch": 0.1484375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.605263157894737e-06,
      "loss": 0.0,
      "num_tokens": 354896.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6609128974378109,
      "epoch": 0.150390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2172318398952484,
      "learning_rate": 4.600000000000001e-06,
      "loss": 0.0,
      "num_tokens": 359656.0,
      "reward": 0.36480000615119934,
      "reward_std": 0.5248825550079346,
      "rewards/reward_func/mean": 0.36480000615119934,
      "rewards/reward_func/std": 0.5037338137626648,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.629855677485466,
      "epoch": 0.15234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2275475412607193,
      "learning_rate": 4.594736842105263e-06,
      "loss": -0.0,
      "num_tokens": 363984.0,
      "reward": 0.5062500238418579,
      "reward_std": 0.570318341255188,
      "rewards/reward_func/mean": 0.5062500238418579,
      "rewards/reward_func/std": 0.5280946493148804,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6377979442477226,
      "epoch": 0.154296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18542706966400146,
      "learning_rate": 4.589473684210526e-06,
      "loss": 0.0,
      "num_tokens": 368672.0,
      "reward": 0.5066750049591064,
      "reward_std": 0.4890046715736389,
      "rewards/reward_func/mean": 0.5066750049591064,
      "rewards/reward_func/std": 0.5222390294075012,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6481011174619198,
      "epoch": 0.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1995760053396225,
      "learning_rate": 4.5842105263157895e-06,
      "loss": 0.0,
      "num_tokens": 373080.0,
      "reward": 0.5015624761581421,
      "reward_std": 0.5755573511123657,
      "rewards/reward_func/mean": 0.5015624761581421,
      "rewards/reward_func/std": 0.5328678488731384,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5756206288933754,
      "epoch": 0.158203125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18328534066677094,
      "learning_rate": 4.578947368421053e-06,
      "loss": 0.0,
      "num_tokens": 377520.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.564460570923984,
      "epoch": 0.16015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16234658658504486,
      "learning_rate": 4.573684210526317e-06,
      "loss": 0.0,
      "num_tokens": 382704.0,
      "reward": 0.2447499930858612,
      "reward_std": 0.4894999861717224,
      "rewards/reward_func/mean": 0.2447499930858612,
      "rewards/reward_func/std": 0.4533279538154602,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4652852639555931,
      "epoch": 0.162109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18636715412139893,
      "learning_rate": 4.568421052631579e-06,
      "loss": -0.0,
      "num_tokens": 387048.0,
      "reward": 0.3812499940395355,
      "reward_std": 0.531643271446228,
      "rewards/reward_func/mean": 0.3812499940395355,
      "rewards/reward_func/std": 0.5126523971557617,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5575841926038265,
      "epoch": 0.1640625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14538714289665222,
      "learning_rate": 4.563157894736843e-06,
      "loss": 0.0,
      "num_tokens": 391904.0,
      "reward": 0.25241249799728394,
      "reward_std": 0.27352577447891235,
      "rewards/reward_func/mean": 0.25241249799728394,
      "rewards/reward_func/std": 0.44840875267982483,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8532749712467194,
      "epoch": 0.166015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20851509273052216,
      "learning_rate": 4.557894736842105e-06,
      "loss": 0.0,
      "num_tokens": 396616.0,
      "reward": 0.48980000615119934,
      "reward_std": 0.48325222730636597,
      "rewards/reward_func/mean": 0.48980000615119934,
      "rewards/reward_func/std": 0.5239458084106445,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4446020573377609,
      "epoch": 0.16796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1834844946861267,
      "learning_rate": 4.552631578947369e-06,
      "loss": 0.0,
      "num_tokens": 401784.0,
      "reward": 0.12443750351667404,
      "reward_std": 0.24887500703334808,
      "rewards/reward_func/mean": 0.12443750351667404,
      "rewards/reward_func/std": 0.3370656669139862,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8720123656094074,
      "epoch": 0.169921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19286687672138214,
      "learning_rate": 4.547368421052632e-06,
      "loss": 0.0,
      "num_tokens": 406488.0,
      "reward": 0.48500001430511475,
      "reward_std": 0.4904162883758545,
      "rewards/reward_func/mean": 0.48500001430511475,
      "rewards/reward_func/std": 0.5189825296401978,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6445259749889374,
      "epoch": 0.171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18657498061656952,
      "learning_rate": 4.542105263157895e-06,
      "loss": 0.0,
      "num_tokens": 411192.0,
      "reward": 0.8723000288009644,
      "reward_std": 0.25540000200271606,
      "rewards/reward_func/mean": 0.8723000288009644,
      "rewards/reward_func/std": 0.3525434732437134,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6623015515506268,
      "epoch": 0.173828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21270084381103516,
      "learning_rate": 4.536842105263158e-06,
      "loss": -0.0,
      "num_tokens": 415672.0,
      "reward": 0.375,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.623845137655735,
      "epoch": 0.17578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2535983622074127,
      "learning_rate": 4.531578947368421e-06,
      "loss": 0.0,
      "num_tokens": 419992.0,
      "reward": 0.390625,
      "reward_std": 0.5289278030395508,
      "rewards/reward_func/mean": 0.390625,
      "rewards/reward_func/std": 0.5054501891136169,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5036896169185638,
      "epoch": 0.177734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16919977962970734,
      "learning_rate": 4.526315789473685e-06,
      "loss": 0.0,
      "num_tokens": 424840.0,
      "reward": 0.3747124969959259,
      "reward_std": 0.5183161497116089,
      "rewards/reward_func/mean": 0.3747124969959259,
      "rewards/reward_func/std": 0.5003470778465271,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5636924169957638,
      "epoch": 0.1796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22071091830730438,
      "learning_rate": 4.5210526315789475e-06,
      "loss": 0.0,
      "num_tokens": 429592.0,
      "reward": 0.3709375262260437,
      "reward_std": 0.5032609701156616,
      "rewards/reward_func/mean": 0.3709375262260437,
      "rewards/reward_func/std": 0.4890368580818176,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.609756950289011,
      "epoch": 0.181640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19224682450294495,
      "learning_rate": 4.5157894736842115e-06,
      "loss": 0.0,
      "num_tokens": 433952.0,
      "reward": 0.5015624761581421,
      "reward_std": 0.5755573511123657,
      "rewards/reward_func/mean": 0.5015624761581421,
      "rewards/reward_func/std": 0.5328678488731384,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.40662568248808384,
      "epoch": 0.18359375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.510526315789474e-06,
      "loss": 0.0,
      "num_tokens": 439112.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7100704610347748,
      "epoch": 0.185546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23149752616882324,
      "learning_rate": 4.505263157894737e-06,
      "loss": 0.0,
      "num_tokens": 443864.0,
      "reward": 0.35249999165534973,
      "reward_std": 0.5063546299934387,
      "rewards/reward_func/mean": 0.35249999165534973,
      "rewards/reward_func/std": 0.4864962100982666,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8094519749283791,
      "epoch": 0.1875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15361107885837555,
      "learning_rate": 4.5e-06,
      "loss": 0.0,
      "num_tokens": 448584.0,
      "reward": 0.8723000288009644,
      "reward_std": 0.24825221300125122,
      "rewards/reward_func/mean": 0.8723000288009644,
      "rewards/reward_func/std": 0.3525434732437134,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7366169281303883,
      "epoch": 0.189453125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.150665283203125,
      "learning_rate": 4.494736842105263e-06,
      "loss": 0.0,
      "num_tokens": 453272.0,
      "reward": 0.25,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6502304151654243,
      "epoch": 0.19140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20246288180351257,
      "learning_rate": 4.489473684210527e-06,
      "loss": -0.0,
      "num_tokens": 458144.0,
      "reward": 0.7447500228881836,
      "reward_std": 0.4966987073421478,
      "rewards/reward_func/mean": 0.7447500228881836,
      "rewards/reward_func/std": 0.4598980247974396,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5997471921145916,
      "epoch": 0.193359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20762783288955688,
      "learning_rate": 4.48421052631579e-06,
      "loss": 0.0,
      "num_tokens": 463016.0,
      "reward": 0.6228749752044678,
      "reward_std": 0.5304585099220276,
      "rewards/reward_func/mean": 0.6228749752044678,
      "rewards/reward_func/std": 0.5091015100479126,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7601636275649071,
      "epoch": 0.1953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24827490746974945,
      "learning_rate": 4.478947368421054e-06,
      "loss": 0.0,
      "num_tokens": 467464.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5260781534016132,
      "epoch": 0.197265625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15311233699321747,
      "learning_rate": 4.473684210526316e-06,
      "loss": -0.0,
      "num_tokens": 472096.0,
      "reward": 0.0015625000232830644,
      "reward_std": 0.0031250000465661287,
      "rewards/reward_func/mean": 0.0015625000232830644,
      "rewards/reward_func/std": 0.0044194175861775875,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7099028006196022,
      "epoch": 0.19921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23158502578735352,
      "learning_rate": 4.468421052631579e-06,
      "loss": 0.0,
      "num_tokens": 476832.0,
      "reward": 0.5016124844551086,
      "reward_std": 0.5615566968917847,
      "rewards/reward_func/mean": 0.5016124844551086,
      "rewards/reward_func/std": 0.5200324058532715,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6568261608481407,
      "epoch": 0.201171875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1539800614118576,
      "learning_rate": 4.463157894736842e-06,
      "loss": 0.0,
      "num_tokens": 481592.0,
      "reward": 0.8300000429153442,
      "reward_std": 0.24041630327701569,
      "rewards/reward_func/mean": 0.8300000429153442,
      "rewards/reward_func/std": 0.33602720499038696,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.48168842773884535,
      "epoch": 0.203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19704312086105347,
      "learning_rate": 4.4578947368421054e-06,
      "loss": -0.0,
      "num_tokens": 486440.0,
      "reward": 0.3668999969959259,
      "reward_std": 0.5270397663116455,
      "rewards/reward_func/mean": 0.3668999969959259,
      "rewards/reward_func/std": 0.5063701272010803,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4500679522752762,
      "epoch": 0.205078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15499116480350494,
      "learning_rate": 4.452631578947369e-06,
      "loss": -0.0,
      "num_tokens": 491224.0,
      "reward": 0.12262499332427979,
      "reward_std": 0.24110156297683716,
      "rewards/reward_func/mean": 0.12262499332427979,
      "rewards/reward_func/std": 0.33678168058395386,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6883682459592819,
      "epoch": 0.20703125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12739039957523346,
      "learning_rate": 4.447368421052632e-06,
      "loss": 0.0,
      "num_tokens": 495864.0,
      "reward": 0.25,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5138243455439806,
      "epoch": 0.208984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20175091922283173,
      "learning_rate": 4.442105263157896e-06,
      "loss": -0.0,
      "num_tokens": 500720.0,
      "reward": 0.4923250079154968,
      "reward_std": 0.5613177418708801,
      "rewards/reward_func/mean": 0.49232497811317444,
      "rewards/reward_func/std": 0.5197004675865173,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7696716003119946,
      "epoch": 0.2109375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.19667677581310272,
      "learning_rate": 4.436842105263158e-06,
      "loss": 0.0,
      "num_tokens": 505320.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6834167242050171,
      "epoch": 0.212890625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.19136440753936768,
      "learning_rate": 4.431578947368421e-06,
      "loss": 0.0,
      "num_tokens": 509792.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5321553461253643,
      "epoch": 0.21484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2594543993473053,
      "learning_rate": 4.426315789473684e-06,
      "loss": -0.0,
      "num_tokens": 514392.0,
      "reward": 0.629687488079071,
      "reward_std": 0.5333658456802368,
      "rewards/reward_func/mean": 0.629687488079071,
      "rewards/reward_func/std": 0.5112107992172241,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6744099333882332,
      "epoch": 0.216796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20642893016338348,
      "learning_rate": 4.4210526315789476e-06,
      "loss": -0.0,
      "num_tokens": 519056.0,
      "reward": 0.375,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.375,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6416698023676872,
      "epoch": 0.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22189435362815857,
      "learning_rate": 4.415789473684211e-06,
      "loss": 0.0,
      "num_tokens": 524064.0,
      "reward": 0.23100000619888306,
      "reward_std": 0.4619999825954437,
      "rewards/reward_func/mean": 0.23100000619888306,
      "rewards/reward_func/std": 0.42808806896209717,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6506899148225784,
      "epoch": 0.220703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24659857153892517,
      "learning_rate": 4.410526315789474e-06,
      "loss": 0.0,
      "num_tokens": 529056.0,
      "reward": 0.3370000123977661,
      "reward_std": 0.483335018157959,
      "rewards/reward_func/mean": 0.3370000123977661,
      "rewards/reward_func/std": 0.46515026688575745,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6117045246064663,
      "epoch": 0.22265625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15469233691692352,
      "learning_rate": 4.405263157894737e-06,
      "loss": 0.0,
      "num_tokens": 533664.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.681044191122055,
      "epoch": 0.224609375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1914876252412796,
      "learning_rate": 4.4e-06,
      "loss": 0.0,
      "num_tokens": 538104.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6017662808299065,
      "epoch": 0.2265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2209053784608841,
      "learning_rate": 4.394736842105263e-06,
      "loss": 0.0,
      "num_tokens": 543024.0,
      "reward": 0.4899500012397766,
      "reward_std": 0.5516552925109863,
      "rewards/reward_func/mean": 0.4899500012397766,
      "rewards/reward_func/std": 0.5108809471130371,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6683652102947235,
      "epoch": 0.228515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19945447146892548,
      "learning_rate": 4.3894736842105266e-06,
      "loss": -0.0,
      "num_tokens": 547656.0,
      "reward": 0.3683124780654907,
      "reward_std": 0.5250316858291626,
      "rewards/reward_func/mean": 0.3683124780654907,
      "rewards/reward_func/std": 0.5048869252204895,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3754726406186819,
      "epoch": 0.23046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19966642558574677,
      "learning_rate": 4.38421052631579e-06,
      "loss": 0.0,
      "num_tokens": 552160.0,
      "reward": 0.47360000014305115,
      "reward_std": 0.47124379873275757,
      "rewards/reward_func/mean": 0.47360000014305115,
      "rewards/reward_func/std": 0.5068238377571106,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4965004436671734,
      "epoch": 0.232421875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1419457346200943,
      "learning_rate": 4.378947368421053e-06,
      "loss": 0.0,
      "num_tokens": 557144.0,
      "reward": 0.11100000143051147,
      "reward_std": 0.22200000286102295,
      "rewards/reward_func/mean": 0.11100000143051147,
      "rewards/reward_func/std": 0.3139553964138031,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.509646050632,
      "epoch": 0.234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18533200025558472,
      "learning_rate": 4.373684210526316e-06,
      "loss": -0.0,
      "num_tokens": 561792.0,
      "reward": 0.37892499566078186,
      "reward_std": 0.5219879150390625,
      "rewards/reward_func/mean": 0.37892499566078186,
      "rewards/reward_func/std": 0.5029488205909729,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6187039539217949,
      "epoch": 0.236328125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14618447422981262,
      "learning_rate": 4.368421052631579e-06,
      "loss": 0.0,
      "num_tokens": 566528.0,
      "reward": 0.9474999904632568,
      "reward_std": 0.01500000525265932,
      "rewards/reward_func/mean": 0.9474999904632568,
      "rewards/reward_func/std": 0.02121320553123951,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4034488648176193,
      "epoch": 0.23828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17633076012134552,
      "learning_rate": 4.363157894736842e-06,
      "loss": 0.0,
      "num_tokens": 571704.0,
      "reward": 0.7593749761581421,
      "reward_std": 0.48124998807907104,
      "rewards/reward_func/mean": 0.7593749761581421,
      "rewards/reward_func/std": 0.4460016191005707,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6987418737262487,
      "epoch": 0.240234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26086556911468506,
      "learning_rate": 4.3578947368421055e-06,
      "loss": -0.0,
      "num_tokens": 576560.0,
      "reward": 0.6266999840736389,
      "reward_std": 0.5093674659729004,
      "rewards/reward_func/mean": 0.6266999840736389,
      "rewards/reward_func/std": 0.4916507303714752,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7009507194161415,
      "epoch": 0.2421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2221718728542328,
      "learning_rate": 4.352631578947369e-06,
      "loss": -0.0,
      "num_tokens": 581488.0,
      "reward": 0.49502497911453247,
      "reward_std": 0.5644694566726685,
      "rewards/reward_func/mean": 0.49502497911453247,
      "rewards/reward_func/std": 0.5226343870162964,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3089421261101961,
      "epoch": 0.244140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16174942255020142,
      "learning_rate": 4.347368421052632e-06,
      "loss": 0.0,
      "num_tokens": 585992.0,
      "reward": 0.6930000185966492,
      "reward_std": 0.4620679020881653,
      "rewards/reward_func/mean": 0.6930000185966492,
      "rewards/reward_func/std": 0.4277917146682739,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6322880983352661,
      "epoch": 0.24609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19183886051177979,
      "learning_rate": 4.342105263157895e-06,
      "loss": -0.0,
      "num_tokens": 590960.0,
      "reward": 0.2251249998807907,
      "reward_std": 0.25995194911956787,
      "rewards/reward_func/mean": 0.2251250147819519,
      "rewards/reward_func/std": 0.4091717302799225,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8441327884793282,
      "epoch": 0.248046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22232002019882202,
      "learning_rate": 4.336842105263158e-06,
      "loss": -0.0,
      "num_tokens": 595496.0,
      "reward": 0.37229999899864197,
      "reward_std": 0.5355914831161499,
      "rewards/reward_func/mean": 0.37229999899864197,
      "rewards/reward_func/std": 0.5138660669326782,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3662140220403671,
      "epoch": 0.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19537632167339325,
      "learning_rate": 4.331578947368421e-06,
      "loss": 0.0,
      "num_tokens": 599992.0,
      "reward": 0.4710499942302704,
      "reward_std": 0.46093741059303284,
      "rewards/reward_func/mean": 0.4710499942302704,
      "rewards/reward_func/std": 0.49048370122909546,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6109810192137957,
      "epoch": 0.251953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20167317986488342,
      "learning_rate": 4.3263157894736845e-06,
      "loss": 0.0,
      "num_tokens": 604848.0,
      "reward": 0.25439998507499695,
      "reward_std": 0.47605061531066895,
      "rewards/reward_func/mean": 0.25439998507499695,
      "rewards/reward_func/std": 0.44077494740486145,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.765848021954298,
      "epoch": 0.25390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2271883189678192,
      "learning_rate": 4.321052631578948e-06,
      "loss": 0.0,
      "num_tokens": 609472.0,
      "reward": 0.49524998664855957,
      "reward_std": 0.48512208461761475,
      "rewards/reward_func/mean": 0.49524998664855957,
      "rewards/reward_func/std": 0.5163409113883972,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6323870681226254,
      "epoch": 0.255859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.204331636428833,
      "learning_rate": 4.315789473684211e-06,
      "loss": 0.0,
      "num_tokens": 613824.0,
      "reward": 0.7515624761581421,
      "reward_std": 0.49687498807907104,
      "rewards/reward_func/mean": 0.7515624761581421,
      "rewards/reward_func/std": 0.4600290060043335,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6582241244614124,
      "epoch": 0.2578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23651830852031708,
      "learning_rate": 4.310526315789474e-06,
      "loss": -0.0,
      "num_tokens": 618512.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6236909478902817,
      "epoch": 0.259765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1894795149564743,
      "learning_rate": 4.305263157894737e-06,
      "loss": 0.0,
      "num_tokens": 623488.0,
      "reward": 0.2150000035762787,
      "reward_std": 0.4300000071525574,
      "rewards/reward_func/mean": 0.2150000035762787,
      "rewards/reward_func/std": 0.39838388562202454,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.23316527949646115,
      "epoch": 0.26171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.14522254467010498,
      "learning_rate": 4.3e-06,
      "loss": 0.0,
      "num_tokens": 627992.0,
      "reward": 0.3499000072479248,
      "reward_std": 0.5033524036407471,
      "rewards/reward_func/mean": 0.3499000072479248,
      "rewards/reward_func/std": 0.48291856050491333,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5633776243776083,
      "epoch": 0.263671875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14741557836532593,
      "learning_rate": 4.2947368421052635e-06,
      "loss": 0.0,
      "num_tokens": 632352.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5170593503862619,
      "epoch": 0.265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20114511251449585,
      "learning_rate": 4.289473684210527e-06,
      "loss": 0.0,
      "num_tokens": 637200.0,
      "reward": 0.2691749930381775,
      "reward_std": 0.28962743282318115,
      "rewards/reward_func/mean": 0.2691749930381775,
      "rewards/reward_func/std": 0.44533297419548035,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.35685587860643864,
      "epoch": 0.267578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17862118780612946,
      "learning_rate": 4.28421052631579e-06,
      "loss": -0.0,
      "num_tokens": 641688.0,
      "reward": 0.6972000002861023,
      "reward_std": 0.46480000019073486,
      "rewards/reward_func/mean": 0.6972000002861023,
      "rewards/reward_func/std": 0.43032118678092957,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8764316439628601,
      "epoch": 0.26953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22691944241523743,
      "learning_rate": 4.278947368421053e-06,
      "loss": 0.0,
      "num_tokens": 646184.0,
      "reward": 0.49729999899864197,
      "reward_std": 0.5742666125297546,
      "rewards/reward_func/mean": 0.49729999899864197,
      "rewards/reward_func/std": 0.531683087348938,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.703185360878706,
      "epoch": 0.271484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21261753141880035,
      "learning_rate": 4.273684210526316e-06,
      "loss": 0.0,
      "num_tokens": 651160.0,
      "reward": 0.4440000057220459,
      "reward_std": 0.5126870274543762,
      "rewards/reward_func/mean": 0.4440000057220459,
      "rewards/reward_func/std": 0.47465598583221436,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7478283271193504,
      "epoch": 0.2734375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15642641484737396,
      "learning_rate": 4.268421052631579e-06,
      "loss": 0.0,
      "num_tokens": 656016.0,
      "reward": 0.11140000075101852,
      "reward_std": 0.22280000150203705,
      "rewards/reward_func/mean": 0.11140000075101852,
      "rewards/reward_func/std": 0.3150867819786072,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.507564508356154,
      "epoch": 0.275390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15584400296211243,
      "learning_rate": 4.2631578947368425e-06,
      "loss": 0.0,
      "num_tokens": 660816.0,
      "reward": 0.24137499928474426,
      "reward_std": 0.45809125900268555,
      "rewards/reward_func/mean": 0.24137499928474426,
      "rewards/reward_func/std": 0.42415857315063477,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6022160463035107,
      "epoch": 0.27734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20966751873493195,
      "learning_rate": 4.257894736842106e-06,
      "loss": -0.0,
      "num_tokens": 665656.0,
      "reward": 0.7447500228881836,
      "reward_std": 0.2991751432418823,
      "rewards/reward_func/mean": 0.7447500228881836,
      "rewards/reward_func/std": 0.4598980247974396,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9077504724264145,
      "epoch": 0.279296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27349957823753357,
      "learning_rate": 4.252631578947369e-06,
      "loss": 0.0,
      "num_tokens": 670144.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5460263751447201,
      "epoch": 0.28125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17890246212482452,
      "learning_rate": 4.247368421052632e-06,
      "loss": 0.0,
      "num_tokens": 674832.0,
      "reward": 0.503125011920929,
      "reward_std": 0.49798667430877686,
      "rewards/reward_func/mean": 0.503125011920929,
      "rewards/reward_func/std": 0.5312447547912598,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6752507574856281,
      "epoch": 0.283203125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17742383480072021,
      "learning_rate": 4.242105263157895e-06,
      "loss": 0.0,
      "num_tokens": 679176.0,
      "reward": 0.75,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6691841837018728,
      "epoch": 0.28515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2150551676750183,
      "learning_rate": 4.236842105263158e-06,
      "loss": -0.0,
      "num_tokens": 683968.0,
      "reward": 0.7343499660491943,
      "reward_std": 0.49009713530540466,
      "rewards/reward_func/mean": 0.7343499660491943,
      "rewards/reward_func/std": 0.45377910137176514,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.970278836786747,
      "epoch": 0.287109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27998873591423035,
      "learning_rate": 4.2315789473684215e-06,
      "loss": -0.0,
      "num_tokens": 688464.0,
      "reward": 0.37229999899864197,
      "reward_std": 0.5355914831161499,
      "rewards/reward_func/mean": 0.37229999899864197,
      "rewards/reward_func/std": 0.5138660669326782,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.38460469990968704,
      "epoch": 0.2890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24717585742473602,
      "learning_rate": 4.226315789473685e-06,
      "loss": 0.0,
      "num_tokens": 693256.0,
      "reward": 0.36281248927116394,
      "reward_std": 0.5174503326416016,
      "rewards/reward_func/mean": 0.36281248927116394,
      "rewards/reward_func/std": 0.4976207911968231,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8782265558838844,
      "epoch": 0.291015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23354728519916534,
      "learning_rate": 4.221052631578948e-06,
      "loss": 0.0,
      "num_tokens": 697904.0,
      "reward": 0.3670499920845032,
      "reward_std": 0.5250914692878723,
      "rewards/reward_func/mean": 0.3670499920845032,
      "rewards/reward_func/std": 0.5067015290260315,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6834246851503849,
      "epoch": 0.29296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2545972764492035,
      "learning_rate": 4.215789473684211e-06,
      "loss": -0.0,
      "num_tokens": 702248.0,
      "reward": 0.7447500228881836,
      "reward_std": 0.2991751432418823,
      "rewards/reward_func/mean": 0.7447500228881836,
      "rewards/reward_func/std": 0.4598980247974396,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6247103922069073,
      "epoch": 0.294921875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14536061882972717,
      "learning_rate": 4.210526315789474e-06,
      "loss": 0.0,
      "num_tokens": 707120.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5234713517129421,
      "epoch": 0.296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2018449902534485,
      "learning_rate": 4.205263157894737e-06,
      "loss": 0.0,
      "num_tokens": 711768.0,
      "reward": 0.6231000423431396,
      "reward_std": 0.5223661661148071,
      "rewards/reward_func/mean": 0.6230999827384949,
      "rewards/reward_func/std": 0.5024495124816895,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4138908423483372,
      "epoch": 0.298828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1805502027273178,
      "learning_rate": 4.2000000000000004e-06,
      "loss": -0.0,
      "num_tokens": 716568.0,
      "reward": 0.3546624779701233,
      "reward_std": 0.5078586935997009,
      "rewards/reward_func/mean": 0.3546624779701233,
      "rewards/reward_func/std": 0.48623159527778625,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5758881866931915,
      "epoch": 0.30078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23512139916419983,
      "learning_rate": 4.194736842105264e-06,
      "loss": 0.0,
      "num_tokens": 721352.0,
      "reward": 0.7323000431060791,
      "reward_std": 0.4886685013771057,
      "rewards/reward_func/mean": 0.7323000431060791,
      "rewards/reward_func/std": 0.45266833901405334,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47453245986253023,
      "epoch": 0.302734375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15655399858951569,
      "learning_rate": 4.189473684210527e-06,
      "loss": 0.0,
      "num_tokens": 725864.0,
      "reward": 0.6951000094413757,
      "reward_std": 0.26594966650009155,
      "rewards/reward_func/mean": 0.6951000094413757,
      "rewards/reward_func/std": 0.42906418442726135,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7080131396651268,
      "epoch": 0.3046875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.20565909147262573,
      "learning_rate": 4.18421052631579e-06,
      "loss": 0.0,
      "num_tokens": 730704.0,
      "reward": 0.23100000619888306,
      "reward_std": 0.2670717239379883,
      "rewards/reward_func/mean": 0.23100000619888306,
      "rewards/reward_func/std": 0.42808806896209717,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 1.0115558728575706,
      "epoch": 0.306640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7812044024467468,
      "learning_rate": 4.178947368421053e-06,
      "loss": 0.0,
      "num_tokens": 735200.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5327462665736675,
      "epoch": 0.30859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18726424872875214,
      "learning_rate": 4.173684210526316e-06,
      "loss": -0.0,
      "num_tokens": 739856.0,
      "reward": 0.3703624904155731,
      "reward_std": 0.5275677442550659,
      "rewards/reward_func/mean": 0.3703624904155731,
      "rewards/reward_func/std": 0.5077766180038452,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7538893632590771,
      "epoch": 0.310546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2491677850484848,
      "learning_rate": 4.1684210526315794e-06,
      "loss": 0.0,
      "num_tokens": 744640.0,
      "reward": 0.24524998664855957,
      "reward_std": 0.2810765504837036,
      "rewards/reward_func/mean": 0.24525000154972076,
      "rewards/reward_func/std": 0.43876922130584717,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.440243948251009,
      "epoch": 0.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2969156801700592,
      "learning_rate": 4.163157894736843e-06,
      "loss": 0.0,
      "num_tokens": 749424.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.01875000074505806,
      "rewards/reward_func/mean": 0.00937500037252903,
      "rewards/reward_func/std": 0.018600596114993095,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5540930740535259,
      "epoch": 0.314453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2284710556268692,
      "learning_rate": 4.157894736842106e-06,
      "loss": 0.0,
      "num_tokens": 754280.0,
      "reward": 0.4969000220298767,
      "reward_std": 0.47432881593704224,
      "rewards/reward_func/mean": 0.4968999922275543,
      "rewards/reward_func/std": 0.5051693916320801,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5730804577469826,
      "epoch": 0.31640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21612071990966797,
      "learning_rate": 4.152631578947369e-06,
      "loss": -0.0,
      "num_tokens": 759216.0,
      "reward": 0.1597999930381775,
      "reward_std": 0.251237154006958,
      "rewards/reward_func/mean": 0.1597999930381775,
      "rewards/reward_func/std": 0.3319000005722046,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9729300402104855,
      "epoch": 0.318359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26729077100753784,
      "learning_rate": 4.147368421052632e-06,
      "loss": -0.0,
      "num_tokens": 763944.0,
      "reward": 0.7394999861717224,
      "reward_std": 0.30079948902130127,
      "rewards/reward_func/mean": 0.7394999861717224,
      "rewards/reward_func/std": 0.4567972421646118,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5482957363128662,
      "epoch": 0.3203125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.19377972185611725,
      "learning_rate": 4.142105263157895e-06,
      "loss": 0.0,
      "num_tokens": 768392.0,
      "reward": 0.8723000288009644,
      "reward_std": 0.24825221300125122,
      "rewards/reward_func/mean": 0.8723000288009644,
      "rewards/reward_func/std": 0.3525434732437134,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6647583916783333,
      "epoch": 0.322265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21792468428611755,
      "learning_rate": 4.136842105263158e-06,
      "loss": -0.0,
      "num_tokens": 773112.0,
      "reward": 0.6197500228881836,
      "reward_std": 0.2605000138282776,
      "rewards/reward_func/mean": 0.6197500228881836,
      "rewards/reward_func/std": 0.5133981704711914,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5656867567449808,
      "epoch": 0.32421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2745096683502197,
      "learning_rate": 4.1315789473684216e-06,
      "loss": 0.0,
      "num_tokens": 778024.0,
      "reward": 0.3997125029563904,
      "reward_std": 0.4995589852333069,
      "rewards/reward_func/mean": 0.3997125029563904,
      "rewards/reward_func/std": 0.47963643074035645,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5817390941083431,
      "epoch": 0.326171875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1972116380929947,
      "learning_rate": 4.126315789473685e-06,
      "loss": -0.0,
      "num_tokens": 782976.0,
      "reward": 0.0015625000232830644,
      "reward_std": 0.0031250000465661287,
      "rewards/reward_func/mean": 0.0015625000232830644,
      "rewards/reward_func/std": 0.0044194175861775875,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6987337954342365,
      "epoch": 0.328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20925994217395782,
      "learning_rate": 4.121052631578948e-06,
      "loss": -0.0,
      "num_tokens": 787984.0,
      "reward": 0.35600000619888306,
      "reward_std": 0.5124994516372681,
      "rewards/reward_func/mean": 0.35600000619888306,
      "rewards/reward_func/std": 0.4916515350341797,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9033612161874771,
      "epoch": 0.330078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2295723259449005,
      "learning_rate": 4.115789473684211e-06,
      "loss": 0.0,
      "num_tokens": 792344.0,
      "reward": 0.6353750228881836,
      "reward_std": 0.5139543414115906,
      "rewards/reward_func/mean": 0.6353750228881836,
      "rewards/reward_func/std": 0.4924015998840332,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7016931138932705,
      "epoch": 0.33203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20966175198554993,
      "learning_rate": 4.110526315789474e-06,
      "loss": 0.0,
      "num_tokens": 797032.0,
      "reward": 0.5,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5345224738121033,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6281918026506901,
      "epoch": 0.333984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20377099514007568,
      "learning_rate": 4.105263157894737e-06,
      "loss": 0.0,
      "num_tokens": 801688.0,
      "reward": 0.7515624761581421,
      "reward_std": 0.49687498807907104,
      "rewards/reward_func/mean": 0.7515624761581421,
      "rewards/reward_func/std": 0.4600290060043335,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.757878951728344,
      "epoch": 0.3359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25204285979270935,
      "learning_rate": 4.1e-06,
      "loss": -0.0,
      "num_tokens": 806624.0,
      "reward": 0.24729999899864197,
      "reward_std": 0.49459999799728394,
      "rewards/reward_func/mean": 0.24729999899864197,
      "rewards/reward_func/std": 0.4579470157623291,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5134147731587291,
      "epoch": 0.337890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.168964684009552,
      "learning_rate": 4.094736842105264e-06,
      "loss": -0.0,
      "num_tokens": 811104.0,
      "reward": 0.5767999887466431,
      "reward_std": 0.4959026575088501,
      "rewards/reward_func/mean": 0.5767999887466431,
      "rewards/reward_func/std": 0.47768643498420715,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6563267186284065,
      "epoch": 0.33984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23291705548763275,
      "learning_rate": 4.089473684210527e-06,
      "loss": 0.0,
      "num_tokens": 815760.0,
      "reward": 0.49806252121925354,
      "reward_std": 0.49673035740852356,
      "rewards/reward_func/mean": 0.49806249141693115,
      "rewards/reward_func/std": 0.5292056798934937,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6437232866883278,
      "epoch": 0.341796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24242784082889557,
      "learning_rate": 4.08421052631579e-06,
      "loss": 0.0,
      "num_tokens": 820600.0,
      "reward": 0.45920002460479736,
      "reward_std": 0.53072190284729,
      "rewards/reward_func/mean": 0.45920002460479736,
      "rewards/reward_func/std": 0.49137234687805176,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.44995045475661755,
      "epoch": 0.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20669235289096832,
      "learning_rate": 4.078947368421053e-06,
      "loss": 0.0,
      "num_tokens": 825384.0,
      "reward": 0.25687500834465027,
      "reward_std": 0.2894454598426819,
      "rewards/reward_func/mean": 0.2568749785423279,
      "rewards/reward_func/std": 0.4227794110774994,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6303725428879261,
      "epoch": 0.345703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17360170185565948,
      "learning_rate": 4.073684210526316e-06,
      "loss": -0.0,
      "num_tokens": 830176.0,
      "reward": 0.24449999630451202,
      "reward_std": 0.48899999260902405,
      "rewards/reward_func/mean": 0.24449999630451202,
      "rewards/reward_func/std": 0.4528787434101105,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5138363922014832,
      "epoch": 0.34765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23947031795978546,
      "learning_rate": 4.0684210526315795e-06,
      "loss": -0.0,
      "num_tokens": 834968.0,
      "reward": 0.476812481880188,
      "reward_std": 0.4751999080181122,
      "rewards/reward_func/mean": 0.476812481880188,
      "rewards/reward_func/std": 0.506460964679718,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8445651698857546,
      "epoch": 0.349609375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.2013005018234253,
      "learning_rate": 4.063157894736842e-06,
      "loss": -0.0,
      "num_tokens": 839816.0,
      "reward": 0.625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6270747724920511,
      "epoch": 0.3515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2170683890581131,
      "learning_rate": 4.057894736842106e-06,
      "loss": -0.0,
      "num_tokens": 844424.0,
      "reward": 0.7447500228881836,
      "reward_std": 0.2991751432418823,
      "rewards/reward_func/mean": 0.7447500228881836,
      "rewards/reward_func/std": 0.4598980247974396,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7638631723821163,
      "epoch": 0.353515625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.2188190519809723,
      "learning_rate": 4.052631578947368e-06,
      "loss": 0.0,
      "num_tokens": 848736.0,
      "reward": 0.8765624761581421,
      "reward_std": 0.24687500298023224,
      "rewards/reward_func/mean": 0.8765624761581421,
      "rewards/reward_func/std": 0.34913399815559387,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6433968283236027,
      "epoch": 0.35546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21660254895687103,
      "learning_rate": 4.047368421052632e-06,
      "loss": 0.0,
      "num_tokens": 853696.0,
      "reward": 0.2412624955177307,
      "reward_std": 0.4451237916946411,
      "rewards/reward_func/mean": 0.2412625104188919,
      "rewards/reward_func/std": 0.4122578501701355,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7037541754543781,
      "epoch": 0.357421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24847397208213806,
      "learning_rate": 4.042105263157895e-06,
      "loss": 0.0,
      "num_tokens": 858184.0,
      "reward": 0.5062500238418579,
      "reward_std": 0.4917367100715637,
      "rewards/reward_func/mean": 0.5062500238418579,
      "rewards/reward_func/std": 0.5279255509376526,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6735056638717651,
      "epoch": 0.359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24711540341377258,
      "learning_rate": 4.0368421052631585e-06,
      "loss": 0.0,
      "num_tokens": 862816.0,
      "reward": 0.49459999799728394,
      "reward_std": 0.5711829662322998,
      "rewards/reward_func/mean": 0.49459999799728394,
      "rewards/reward_func/std": 0.5288126468658447,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.38281242828816175,
      "epoch": 0.361328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20001716911792755,
      "learning_rate": 4.031578947368422e-06,
      "loss": -0.0,
      "num_tokens": 867320.0,
      "reward": 0.5843999981880188,
      "reward_std": 0.504734992980957,
      "rewards/reward_func/mean": 0.5843999981880188,
      "rewards/reward_func/std": 0.4840165376663208,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6081971619278193,
      "epoch": 0.36328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21603205800056458,
      "learning_rate": 4.026315789473684e-06,
      "loss": 0.0,
      "num_tokens": 872280.0,
      "reward": 0.2266875058412552,
      "reward_std": 0.2657185196876526,
      "rewards/reward_func/mean": 0.2266875058412552,
      "rewards/reward_func/std": 0.40837597846984863,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6896807588636875,
      "epoch": 0.365234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23351432383060455,
      "learning_rate": 4.021052631578948e-06,
      "loss": 0.0,
      "num_tokens": 876816.0,
      "reward": 0.38750001788139343,
      "reward_std": 0.2557178735733032,
      "rewards/reward_func/mean": 0.38749998807907104,
      "rewards/reward_func/std": 0.5074445605278015,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6723958812654018,
      "epoch": 0.3671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2449483573436737,
      "learning_rate": 4.01578947368421e-06,
      "loss": 0.0,
      "num_tokens": 881448.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5192831270396709,
      "epoch": 0.369140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23788705468177795,
      "learning_rate": 4.010526315789474e-06,
      "loss": 0.0,
      "num_tokens": 886288.0,
      "reward": 0.2617875039577484,
      "reward_std": 0.4782521724700928,
      "rewards/reward_func/mean": 0.2617875039577484,
      "rewards/reward_func/std": 0.4428786039352417,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5673484560102224,
      "epoch": 0.37109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2121075987815857,
      "learning_rate": 4.005263157894737e-06,
      "loss": -0.0,
      "num_tokens": 891080.0,
      "reward": 0.23899999260902405,
      "reward_std": 0.4779999852180481,
      "rewards/reward_func/mean": 0.23899999260902405,
      "rewards/reward_func/std": 0.44254201650619507,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.40696537867188454,
      "epoch": 0.373046875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13059356808662415,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.0,
      "num_tokens": 895568.0,
      "reward": 0.6972000002861023,
      "reward_std": 0.2683524191379547,
      "rewards/reward_func/mean": 0.6972000002861023,
      "rewards/reward_func/std": 0.43032118678092957,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7386405281722546,
      "epoch": 0.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24719874560832977,
      "learning_rate": 3.994736842105264e-06,
      "loss": 0.0,
      "num_tokens": 900312.0,
      "reward": 0.48875001072883606,
      "reward_std": 0.4433884024620056,
      "rewards/reward_func/mean": 0.48874998092651367,
      "rewards/reward_func/std": 0.4832848906517029,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5994266867637634,
      "epoch": 0.376953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22721026837825775,
      "learning_rate": 3.989473684210526e-06,
      "loss": 0.0,
      "num_tokens": 904664.0,
      "reward": 0.6390625238418579,
      "reward_std": 0.517444908618927,
      "rewards/reward_func/mean": 0.6390625238418579,
      "rewards/reward_func/std": 0.49845463037490845,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6586835943162441,
      "epoch": 0.37890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21596045792102814,
      "learning_rate": 3.98421052631579e-06,
      "loss": 0.0,
      "num_tokens": 909648.0,
      "reward": 0.11412499845027924,
      "reward_std": 0.2256084382534027,
      "rewards/reward_func/mean": 0.11412499845027924,
      "rewards/reward_func/std": 0.3127436935901642,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7914648056030273,
      "epoch": 0.380859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23387473821640015,
      "learning_rate": 3.9789473684210525e-06,
      "loss": -0.0,
      "num_tokens": 914416.0,
      "reward": 0.6203374862670898,
      "reward_std": 0.5114390850067139,
      "rewards/reward_func/mean": 0.6203374862670898,
      "rewards/reward_func/std": 0.49055466055870056,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5869229193776846,
      "epoch": 0.3828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23709914088249207,
      "learning_rate": 3.9736842105263165e-06,
      "loss": 0.0,
      "num_tokens": 919352.0,
      "reward": 0.5044000148773193,
      "reward_std": 0.4865538477897644,
      "rewards/reward_func/mean": 0.5044000148773193,
      "rewards/reward_func/std": 0.5128971934318542,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.579900648444891,
      "epoch": 0.384765625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15101991593837738,
      "learning_rate": 3.968421052631579e-06,
      "loss": 0.0,
      "num_tokens": 923976.0,
      "reward": 0.9973000288009644,
      "reward_std": 0.0054000020027160645,
      "rewards/reward_func/mean": 0.9973000288009644,
      "rewards/reward_func/std": 0.007636756170541048,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8158314451575279,
      "epoch": 0.38671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21359558403491974,
      "learning_rate": 3.963157894736843e-06,
      "loss": 0.0,
      "num_tokens": 928512.0,
      "reward": 0.12968750298023224,
      "reward_std": 0.2552257776260376,
      "rewards/reward_func/mean": 0.12968750298023224,
      "rewards/reward_func/std": 0.35177722573280334,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9157693684101105,
      "epoch": 0.388671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.254290372133255,
      "learning_rate": 3.957894736842106e-06,
      "loss": 0.0,
      "num_tokens": 933008.0,
      "reward": 0.2562499940395355,
      "reward_std": 0.49611565470695496,
      "rewards/reward_func/mean": 0.2562499940395355,
      "rewards/reward_func/std": 0.4593765139579773,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6207092180848122,
      "epoch": 0.390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18421423435211182,
      "learning_rate": 3.952631578947368e-06,
      "loss": 0.0,
      "num_tokens": 937968.0,
      "reward": 0.2251249998807907,
      "reward_std": 0.44199562072753906,
      "rewards/reward_func/mean": 0.2251249998807907,
      "rewards/reward_func/std": 0.40922626852989197,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7159414663910866,
      "epoch": 0.392578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21772877871990204,
      "learning_rate": 3.947368421052632e-06,
      "loss": 0.0,
      "num_tokens": 942312.0,
      "reward": 0.1796875149011612,
      "reward_std": 0.2510603666305542,
      "rewards/reward_func/mean": 0.1796875149011612,
      "rewards/reward_func/std": 0.33246493339538574,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6835194565355778,
      "epoch": 0.39453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22371238470077515,
      "learning_rate": 3.942105263157895e-06,
      "loss": 0.0,
      "num_tokens": 947480.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7550305463373661,
      "epoch": 0.396484375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 3.936842105263159e-06,
      "loss": 0.0,
      "num_tokens": 952168.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7681427337229252,
      "epoch": 0.3984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22429360449314117,
      "learning_rate": 3.931578947368421e-06,
      "loss": -0.0,
      "num_tokens": 956920.0,
      "reward": 0.4778124988079071,
      "reward_std": 0.46526336669921875,
      "rewards/reward_func/mean": 0.4778125286102295,
      "rewards/reward_func/std": 0.4945225715637207,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7594692595303059,
      "epoch": 0.400390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24203604459762573,
      "learning_rate": 3.926315789473685e-06,
      "loss": 0.0,
      "num_tokens": 961672.0,
      "reward": 0.4778124988079071,
      "reward_std": 0.46526336669921875,
      "rewards/reward_func/mean": 0.4778125286102295,
      "rewards/reward_func/std": 0.4945225715637207,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7869609482586384,
      "epoch": 0.40234375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 3.921052631578947e-06,
      "loss": 0.0,
      "num_tokens": 966320.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4896150715649128,
      "epoch": 0.404296875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12556888163089752,
      "learning_rate": 3.9157894736842104e-06,
      "loss": -0.0,
      "num_tokens": 971120.0,
      "reward": 0.11949999630451202,
      "reward_std": 0.23899999260902405,
      "rewards/reward_func/mean": 0.11949999630451202,
      "rewards/reward_func/std": 0.33799704909324646,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5013071876019239,
      "epoch": 0.40625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20768725872039795,
      "learning_rate": 3.9105263157894744e-06,
      "loss": 0.0,
      "num_tokens": 975976.0,
      "reward": 0.3636625111103058,
      "reward_std": 0.5205842852592468,
      "rewards/reward_func/mean": 0.3636625111103058,
      "rewards/reward_func/std": 0.49861037731170654,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7933114245533943,
      "epoch": 0.408203125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 3.905263157894737e-06,
      "loss": 0.0,
      "num_tokens": 980408.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6895494759082794,
      "epoch": 0.41015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23387707769870758,
      "learning_rate": 3.900000000000001e-06,
      "loss": -0.0,
      "num_tokens": 985104.0,
      "reward": 0.600100040435791,
      "reward_std": 0.5192785263061523,
      "rewards/reward_func/mean": 0.6000999808311462,
      "rewards/reward_func/std": 0.4977251887321472,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7427148595452309,
      "epoch": 0.412109375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16066974401474,
      "learning_rate": 3.894736842105263e-06,
      "loss": 0.0,
      "num_tokens": 989448.0,
      "reward": 0.885937511920929,
      "reward_std": 0.22812499105930328,
      "rewards/reward_func/mean": 0.885937511920929,
      "rewards/reward_func/std": 0.32261747121810913,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7897635065019131,
      "epoch": 0.4140625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17688219249248505,
      "learning_rate": 3.889473684210527e-06,
      "loss": 0.0,
      "num_tokens": 994096.0,
      "reward": 0.12386249750852585,
      "reward_std": 0.24357615411281586,
      "rewards/reward_func/mean": 0.12386249750852585,
      "rewards/reward_func/std": 0.3453129827976227,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49311958625912666,
      "epoch": 0.416015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17744576930999756,
      "learning_rate": 3.884210526315789e-06,
      "loss": 0.0,
      "num_tokens": 998744.0,
      "reward": 0.2519875168800354,
      "reward_std": 0.491636723279953,
      "rewards/reward_func/mean": 0.251987487077713,
      "rewards/reward_func/std": 0.4552379846572876,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7735568955540657,
      "epoch": 0.41796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2734695374965668,
      "learning_rate": 3.878947368421053e-06,
      "loss": -0.0,
      "num_tokens": 1003264.0,
      "reward": 0.37542498111724854,
      "reward_std": 0.5312633514404297,
      "rewards/reward_func/mean": 0.37542498111724854,
      "rewards/reward_func/std": 0.5113483667373657,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6551911272108555,
      "epoch": 0.419921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21010510623455048,
      "learning_rate": 3.873684210526316e-06,
      "loss": 0.0,
      "num_tokens": 1008120.0,
      "reward": 0.6193125247955322,
      "reward_std": 0.5183161497116089,
      "rewards/reward_func/mean": 0.6193125247955322,
      "rewards/reward_func/std": 0.49596303701400757,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6946419551968575,
      "epoch": 0.421875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18316613137722015,
      "learning_rate": 3.868421052631579e-06,
      "loss": 0.0,
      "num_tokens": 1012760.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.594407208263874,
      "epoch": 0.423828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20372089743614197,
      "learning_rate": 3.863157894736843e-06,
      "loss": 0.0,
      "num_tokens": 1017496.0,
      "reward": 0.4871875047683716,
      "reward_std": 0.5233246684074402,
      "rewards/reward_func/mean": 0.4871875047683716,
      "rewards/reward_func/std": 0.48450902104377747,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4554303726181388,
      "epoch": 0.42578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20890522003173828,
      "learning_rate": 3.857894736842105e-06,
      "loss": 0.0,
      "num_tokens": 1021992.0,
      "reward": 0.5825625061988831,
      "reward_std": 0.49896040558815,
      "rewards/reward_func/mean": 0.5825625061988831,
      "rewards/reward_func/std": 0.47897282242774963,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9218173250555992,
      "epoch": 0.427734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22832128405570984,
      "learning_rate": 3.852631578947369e-06,
      "loss": 0.0,
      "num_tokens": 1026488.0,
      "reward": 0.629687488079071,
      "reward_std": 0.5293000936508179,
      "rewards/reward_func/mean": 0.629687488079071,
      "rewards/reward_func/std": 0.5112107992172241,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6961621716618538,
      "epoch": 0.4296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2567530870437622,
      "learning_rate": 3.8473684210526316e-06,
      "loss": -0.0,
      "num_tokens": 1031136.0,
      "reward": 0.4947499930858612,
      "reward_std": 0.48889586329460144,
      "rewards/reward_func/mean": 0.4947499930858612,
      "rewards/reward_func/std": 0.5162643790245056,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49037862569093704,
      "epoch": 0.431640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21043436229228973,
      "learning_rate": 3.842105263157895e-06,
      "loss": -0.0,
      "num_tokens": 1035784.0,
      "reward": 0.6168999671936035,
      "reward_std": 0.5137720108032227,
      "rewards/reward_func/mean": 0.6168999671936035,
      "rewards/reward_func/std": 0.49751195311546326,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5241647306829691,
      "epoch": 0.43359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.189620703458786,
      "learning_rate": 3.836842105263158e-06,
      "loss": -0.0,
      "num_tokens": 1040752.0,
      "reward": 0.12037500739097595,
      "reward_std": 0.2284284085035324,
      "rewards/reward_func/mean": 0.12037499994039536,
      "rewards/reward_func/std": 0.31062963604927063,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7036157362163067,
      "epoch": 0.435546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25706610083580017,
      "learning_rate": 3.831578947368421e-06,
      "loss": -0.0,
      "num_tokens": 1045768.0,
      "reward": 0.5924999713897705,
      "reward_std": 0.5081254839897156,
      "rewards/reward_func/mean": 0.5924999713897705,
      "rewards/reward_func/std": 0.49127423763275146,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6691739205271006,
      "epoch": 0.4375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21749237179756165,
      "learning_rate": 3.826315789473685e-06,
      "loss": -0.0,
      "num_tokens": 1050600.0,
      "reward": 0.6239999532699585,
      "reward_std": 0.5126060247421265,
      "rewards/reward_func/mean": 0.6239999532699585,
      "rewards/reward_func/std": 0.48936182260513306,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6261179707944393,
      "epoch": 0.439453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20746906101703644,
      "learning_rate": 3.821052631578947e-06,
      "loss": 0.0,
      "num_tokens": 1054944.0,
      "reward": 0.29374998807907104,
      "reward_std": 0.4726366698741913,
      "rewards/reward_func/mean": 0.29375001788139343,
      "rewards/reward_func/std": 0.4375765025615692,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5851390026509762,
      "epoch": 0.44140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23050355911254883,
      "learning_rate": 3.815789473684211e-06,
      "loss": 0.0,
      "num_tokens": 1059600.0,
      "reward": 0.6195999979972839,
      "reward_std": 0.5338436961174011,
      "rewards/reward_func/mean": 0.6195999979972839,
      "rewards/reward_func/std": 0.513155460357666,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 1.0195250883698463,
      "epoch": 0.443359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2704000473022461,
      "learning_rate": 3.810526315789474e-06,
      "loss": 0.0,
      "num_tokens": 1064080.0,
      "reward": 0.7473000288009644,
      "reward_std": 0.2940751314163208,
      "rewards/reward_func/mean": 0.7473000288009644,
      "rewards/reward_func/std": 0.4613037705421448,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8138170130550861,
      "epoch": 0.4453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24457702040672302,
      "learning_rate": 3.805263157894737e-06,
      "loss": -0.0,
      "num_tokens": 1069096.0,
      "reward": 0.3586999773979187,
      "reward_std": 0.5152043104171753,
      "rewards/reward_func/mean": 0.3586999773979187,
      "rewards/reward_func/std": 0.49505308270454407,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7082953453063965,
      "epoch": 0.447265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22110335528850555,
      "learning_rate": 3.8000000000000005e-06,
      "loss": -0.0,
      "num_tokens": 1073736.0,
      "reward": 0.3614499866962433,
      "reward_std": 0.5230631828308105,
      "rewards/reward_func/mean": 0.3614499866962433,
      "rewards/reward_func/std": 0.49916091561317444,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.30846840143203735,
      "epoch": 0.44921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1866290271282196,
      "learning_rate": 3.794736842105263e-06,
      "loss": 0.0,
      "num_tokens": 1078248.0,
      "reward": 0.46480000019073486,
      "reward_std": 0.5367048382759094,
      "rewards/reward_func/mean": 0.46480000019073486,
      "rewards/reward_func/std": 0.49689212441444397,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9043601844459772,
      "epoch": 0.451171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2427355796098709,
      "learning_rate": 3.789473684210527e-06,
      "loss": -0.0,
      "num_tokens": 1082592.0,
      "reward": 0.5265624523162842,
      "reward_std": 0.5467819571495056,
      "rewards/reward_func/mean": 0.526562511920929,
      "rewards/reward_func/std": 0.506891667842865,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5939224380999804,
      "epoch": 0.453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23449236154556274,
      "learning_rate": 3.7842105263157895e-06,
      "loss": 0.0,
      "num_tokens": 1087440.0,
      "reward": 0.3668999969959259,
      "reward_std": 0.5270397663116455,
      "rewards/reward_func/mean": 0.3668999969959259,
      "rewards/reward_func/std": 0.5063701272010803,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7096200175583363,
      "epoch": 0.455078125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1598760038614273,
      "learning_rate": 3.778947368421053e-06,
      "loss": -0.0,
      "num_tokens": 1092192.0,
      "reward": 0.819100022315979,
      "reward_std": 0.23282161355018616,
      "rewards/reward_func/mean": 0.819100022315979,
      "rewards/reward_func/std": 0.33110320568084717,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7353874929249287,
      "epoch": 0.45703125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17731121182441711,
      "learning_rate": 3.773684210526316e-06,
      "loss": 0.0,
      "num_tokens": 1096536.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8204295337200165,
      "epoch": 0.458984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19422033429145813,
      "learning_rate": 3.768421052631579e-06,
      "loss": 0.0,
      "num_tokens": 1101328.0,
      "reward": 0.7335000038146973,
      "reward_std": 0.45602166652679443,
      "rewards/reward_func/mean": 0.7335000038146973,
      "rewards/reward_func/std": 0.4227708876132965,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6538256984204054,
      "epoch": 0.4609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.35198187828063965,
      "learning_rate": 3.7631578947368426e-06,
      "loss": 0.0,
      "num_tokens": 1105976.0,
      "reward": 0.4843499958515167,
      "reward_std": 0.4797285199165344,
      "rewards/reward_func/mean": 0.4843499958515167,
      "rewards/reward_func/std": 0.5180744528770447,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.37019990384578705,
      "epoch": 0.462890625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 3.7578947368421053e-06,
      "loss": 0.0,
      "num_tokens": 1110480.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6362588629126549,
      "epoch": 0.46484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.190359964966774,
      "learning_rate": 3.752631578947369e-06,
      "loss": 0.0,
      "num_tokens": 1115256.0,
      "reward": 0.249937504529953,
      "reward_std": 0.47089433670043945,
      "rewards/reward_func/mean": 0.2499374896287918,
      "rewards/reward_func/std": 0.436068594455719,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.914298091083765,
      "epoch": 0.466796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2647916376590729,
      "learning_rate": 3.7473684210526317e-06,
      "loss": 0.0,
      "num_tokens": 1119752.0,
      "reward": 0.7519875168800354,
      "reward_std": 0.4888792634010315,
      "rewards/reward_func/mean": 0.7519875168800354,
      "rewards/reward_func/std": 0.45273634791374207,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.516225041821599,
      "epoch": 0.46875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2023496925830841,
      "learning_rate": 3.7421052631578953e-06,
      "loss": -0.0,
      "num_tokens": 1124736.0,
      "reward": 0.11806250363588333,
      "reward_std": 0.21600480377674103,
      "rewards/reward_func/mean": 0.11806250363588333,
      "rewards/reward_func/std": 0.28930220007896423,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6972608156502247,
      "epoch": 0.470703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22246108949184418,
      "learning_rate": 3.736842105263158e-06,
      "loss": 0.0,
      "num_tokens": 1129560.0,
      "reward": 0.2282000035047531,
      "reward_std": 0.4564000070095062,
      "rewards/reward_func/mean": 0.2282000035047531,
      "rewards/reward_func/std": 0.42270201444625854,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7819927409291267,
      "epoch": 0.47265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21443378925323486,
      "learning_rate": 3.7315789473684216e-06,
      "loss": 0.0,
      "num_tokens": 1134352.0,
      "reward": 0.36834999918937683,
      "reward_std": 0.5177301168441772,
      "rewards/reward_func/mean": 0.36834999918937683,
      "rewards/reward_func/std": 0.49500060081481934,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5241367612034082,
      "epoch": 0.474609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24711766839027405,
      "learning_rate": 3.7263157894736848e-06,
      "loss": 0.0,
      "num_tokens": 1139000.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9131961166858673,
      "epoch": 0.4765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3064594864845276,
      "learning_rate": 3.7210526315789475e-06,
      "loss": -0.0,
      "num_tokens": 1143488.0,
      "reward": 0.8697500228881836,
      "reward_std": 0.2605000138282776,
      "rewards/reward_func/mean": 0.8697500228881836,
      "rewards/reward_func/std": 0.3517392575740814,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.766132302582264,
      "epoch": 0.478515625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 3.715789473684211e-06,
      "loss": 0.0,
      "num_tokens": 1148352.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6391206979751587,
      "epoch": 0.48046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24102520942687988,
      "learning_rate": 3.710526315789474e-06,
      "loss": -0.0,
      "num_tokens": 1153104.0,
      "reward": 0.819100022315979,
      "reward_std": 0.241799995303154,
      "rewards/reward_func/mean": 0.819100022315979,
      "rewards/reward_func/std": 0.33110323548316956,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5933511331677437,
      "epoch": 0.482421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22239266335964203,
      "learning_rate": 3.7052631578947374e-06,
      "loss": 0.0,
      "num_tokens": 1158072.0,
      "reward": 0.36543750762939453,
      "reward_std": 0.2436250001192093,
      "rewards/reward_func/mean": 0.36543750762939453,
      "rewards/reward_func/std": 0.4533359408378601,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47794661670923233,
      "epoch": 0.484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21475116908550262,
      "learning_rate": 3.7e-06,
      "loss": 0.0,
      "num_tokens": 1162872.0,
      "reward": 0.5974999666213989,
      "reward_std": 0.5149734020233154,
      "rewards/reward_func/mean": 0.5974999666213989,
      "rewards/reward_func/std": 0.49477699398994446,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9037050679326057,
      "epoch": 0.486328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2283296287059784,
      "learning_rate": 3.6947368421052637e-06,
      "loss": -0.0,
      "num_tokens": 1167368.0,
      "reward": 0.3828125,
      "reward_std": 0.529944896697998,
      "rewards/reward_func/mean": 0.3828125,
      "rewards/reward_func/std": 0.5115163922309875,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7908897437155247,
      "epoch": 0.48828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21019713580608368,
      "learning_rate": 3.6894736842105265e-06,
      "loss": 0.0,
      "num_tokens": 1172112.0,
      "reward": 0.8685125112533569,
      "reward_std": 0.2369532436132431,
      "rewards/reward_func/mean": 0.8685125112533569,
      "rewards/reward_func/std": 0.32584938406944275,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6358069218695164,
      "epoch": 0.490234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2785353660583496,
      "learning_rate": 3.6842105263157896e-06,
      "loss": 0.0,
      "num_tokens": 1176896.0,
      "reward": 0.6227250099182129,
      "reward_std": 0.5166673064231873,
      "rewards/reward_func/mean": 0.6227250099182129,
      "rewards/reward_func/std": 0.49570804834365845,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.38082710839807987,
      "epoch": 0.4921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20742739737033844,
      "learning_rate": 3.6789473684210532e-06,
      "loss": -0.0,
      "num_tokens": 1181544.0,
      "reward": 0.4947499930858612,
      "reward_std": 0.571418046951294,
      "rewards/reward_func/mean": 0.4947499930858612,
      "rewards/reward_func/std": 0.5290886163711548,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4866997255012393,
      "epoch": 0.494140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17461475729942322,
      "learning_rate": 3.673684210526316e-06,
      "loss": 0.0,
      "num_tokens": 1186056.0,
      "reward": 0.6909000277519226,
      "reward_std": 0.46066808700561523,
      "rewards/reward_func/mean": 0.6909000277519226,
      "rewards/reward_func/std": 0.42650365829467773,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6999755539000034,
      "epoch": 0.49609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22714459896087646,
      "learning_rate": 3.6684210526315796e-06,
      "loss": 0.0,
      "num_tokens": 1191056.0,
      "reward": 0.34780001640319824,
      "reward_std": 0.5009496808052063,
      "rewards/reward_func/mean": 0.34780001640319824,
      "rewards/reward_func/std": 0.4800656735897064,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5954524613916874,
      "epoch": 0.498046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2830187678337097,
      "learning_rate": 3.6631578947368423e-06,
      "loss": 0.0,
      "num_tokens": 1195904.0,
      "reward": 0.34780001640319824,
      "reward_std": 0.5050222873687744,
      "rewards/reward_func/mean": 0.34780001640319824,
      "rewards/reward_func/std": 0.4806229770183563,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.622910525649786,
      "epoch": 0.5,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1351393461227417,
      "learning_rate": 3.657894736842106e-06,
      "loss": 0.0,
      "num_tokens": 1200880.0,
      "reward": 0.3295000195503235,
      "reward_std": 0.2197657823562622,
      "rewards/reward_func/mean": 0.3295000195503235,
      "rewards/reward_func/std": 0.45483532547950745,
      "step": 256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6488723643124104,
      "epoch": 0.501953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21160662174224854,
      "learning_rate": 3.6526315789473686e-06,
      "loss": -0.0,
      "num_tokens": 1205728.0,
      "reward": 0.267612487077713,
      "reward_std": 0.48172545433044434,
      "rewards/reward_func/mean": 0.267612487077713,
      "rewards/reward_func/std": 0.4461328089237213,
      "step": 257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9895205125212669,
      "epoch": 0.50390625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18444561958312988,
      "learning_rate": 3.6473684210526318e-06,
      "loss": 0.0,
      "num_tokens": 1210304.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8820672258734703,
      "epoch": 0.505859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24869714677333832,
      "learning_rate": 3.642105263157895e-06,
      "loss": -0.0,
      "num_tokens": 1215008.0,
      "reward": 0.9872499704360962,
      "reward_std": 0.025499999523162842,
      "rewards/reward_func/mean": 0.9872499704360962,
      "rewards/reward_func/std": 0.024093568325042725,
      "step": 259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6377357617020607,
      "epoch": 0.5078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22434614598751068,
      "learning_rate": 3.636842105263158e-06,
      "loss": 0.0,
      "num_tokens": 1219872.0,
      "reward": 0.6390625238418579,
      "reward_std": 0.518474817276001,
      "rewards/reward_func/mean": 0.6390625238418579,
      "rewards/reward_func/std": 0.4981410801410675,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8828465715050697,
      "epoch": 0.509765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2496601790189743,
      "learning_rate": 3.6315789473684217e-06,
      "loss": 0.0,
      "num_tokens": 1224624.0,
      "reward": 0.4991750121116638,
      "reward_std": 0.5548579692840576,
      "rewards/reward_func/mean": 0.4991750121116638,
      "rewards/reward_func/std": 0.5141252875328064,
      "step": 261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9650222808122635,
      "epoch": 0.51171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2566354274749756,
      "learning_rate": 3.6263157894736844e-06,
      "loss": 0.0,
      "num_tokens": 1229096.0,
      "reward": 0.37812501192092896,
      "reward_std": 0.5366618037223816,
      "rewards/reward_func/mean": 0.37812501192092896,
      "rewards/reward_func/std": 0.5150308012962341,
      "step": 262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.965212844312191,
      "epoch": 0.513671875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1699792593717575,
      "learning_rate": 3.621052631578948e-06,
      "loss": -0.0,
      "num_tokens": 1233576.0,
      "reward": 0.754687488079071,
      "reward_std": 0.2833658754825592,
      "rewards/reward_func/mean": 0.754687488079071,
      "rewards/reward_func/std": 0.45434102416038513,
      "step": 263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6129315309226513,
      "epoch": 0.515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19248241186141968,
      "learning_rate": 3.6157894736842108e-06,
      "loss": 0.0,
      "num_tokens": 1238552.0,
      "reward": 0.22981250286102295,
      "reward_std": 0.43893176317214966,
      "rewards/reward_func/mean": 0.22981250286102295,
      "rewards/reward_func/std": 0.40648478269577026,
      "step": 264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7329412698745728,
      "epoch": 0.517578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20942990481853485,
      "learning_rate": 3.610526315789474e-06,
      "loss": -0.0,
      "num_tokens": 1243528.0,
      "reward": 0.015625,
      "reward_std": 0.02392766997218132,
      "rewards/reward_func/mean": 0.015625,
      "rewards/reward_func/std": 0.0265165064483881,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5853278152644634,
      "epoch": 0.51953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19774886965751648,
      "learning_rate": 3.605263157894737e-06,
      "loss": 0.0,
      "num_tokens": 1248192.0,
      "reward": 0.49303752183914185,
      "reward_std": 0.5586864948272705,
      "rewards/reward_func/mean": 0.49303752183914185,
      "rewards/reward_func/std": 0.5173379778862,
      "step": 266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7437873594462872,
      "epoch": 0.521484375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18486569821834564,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 0.0,
      "num_tokens": 1253032.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6368556562811136,
      "epoch": 0.5234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21867771446704865,
      "learning_rate": 3.5947368421052634e-06,
      "loss": 0.0,
      "num_tokens": 1258256.0,
      "reward": 0.5062500238418579,
      "reward_std": 0.49611562490463257,
      "rewards/reward_func/mean": 0.5062500238418579,
      "rewards/reward_func/std": 0.5280946493148804,
      "step": 268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6667620614171028,
      "epoch": 0.525390625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16187067329883575,
      "learning_rate": 3.5894736842105266e-06,
      "loss": 0.0,
      "num_tokens": 1263120.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5582370385527611,
      "epoch": 0.52734375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16511990129947662,
      "learning_rate": 3.58421052631579e-06,
      "loss": 0.0,
      "num_tokens": 1267480.0,
      "reward": 0.8843749761581421,
      "reward_std": 0.23125000298023224,
      "rewards/reward_func/mean": 0.8843749761581421,
      "rewards/reward_func/std": 0.32703688740730286,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49717821925878525,
      "epoch": 0.529296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19459761679172516,
      "learning_rate": 3.578947368421053e-06,
      "loss": 0.0,
      "num_tokens": 1272136.0,
      "reward": 0.6297374963760376,
      "reward_std": 0.5136181712150574,
      "rewards/reward_func/mean": 0.6297374963760376,
      "rewards/reward_func/std": 0.49753350019454956,
      "step": 271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47147443797439337,
      "epoch": 0.53125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17289377748966217,
      "learning_rate": 3.5736842105263157e-06,
      "loss": -0.0,
      "num_tokens": 1276648.0,
      "reward": 0.6951000094413757,
      "reward_std": 0.4634339511394501,
      "rewards/reward_func/mean": 0.6951000094413757,
      "rewards/reward_func/std": 0.42906418442726135,
      "step": 272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6894197128713131,
      "epoch": 0.533203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24948523938655853,
      "learning_rate": 3.5684210526315792e-06,
      "loss": -0.0,
      "num_tokens": 1281440.0,
      "reward": 0.7309999465942383,
      "reward_std": 0.2882004380226135,
      "rewards/reward_func/mean": 0.7309999465942383,
      "rewards/reward_func/std": 0.4513968229293823,
      "step": 273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8942212909460068,
      "epoch": 0.53515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.9948049783706665,
      "learning_rate": 3.5631578947368424e-06,
      "loss": 0.0,
      "num_tokens": 1286176.0,
      "reward": 0.7289999723434448,
      "reward_std": 0.48616960644721985,
      "rewards/reward_func/mean": 0.7289999723434448,
      "rewards/reward_func/std": 0.45014360547065735,
      "step": 274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8072192035615444,
      "epoch": 0.537109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5503926873207092,
      "learning_rate": 3.5578947368421056e-06,
      "loss": 0.0,
      "num_tokens": 1290680.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6735758259892464,
      "epoch": 0.5390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20834791660308838,
      "learning_rate": 3.5526315789473687e-06,
      "loss": 0.0,
      "num_tokens": 1295328.0,
      "reward": 0.4961625039577484,
      "reward_std": 0.4955860376358032,
      "rewards/reward_func/mean": 0.4961625039577484,
      "rewards/reward_func/std": 0.5272848010063171,
      "step": 276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6799687743186951,
      "epoch": 0.541015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26546961069107056,
      "learning_rate": 3.5473684210526323e-06,
      "loss": 0.0,
      "num_tokens": 1300344.0,
      "reward": 0.23640000820159912,
      "reward_std": 0.47280001640319824,
      "rewards/reward_func/mean": 0.23640000820159912,
      "rewards/reward_func/std": 0.43869248032569885,
      "step": 277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8527603708207607,
      "epoch": 0.54296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.239075168967247,
      "learning_rate": 3.542105263157895e-06,
      "loss": 0.0,
      "num_tokens": 1304872.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47294606268405914,
      "epoch": 0.544921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.208218514919281,
      "learning_rate": 3.536842105263158e-06,
      "loss": -0.0,
      "num_tokens": 1309520.0,
      "reward": 0.3731499910354614,
      "reward_std": 0.5254724621772766,
      "rewards/reward_func/mean": 0.3731499910354614,
      "rewards/reward_func/std": 0.5016124248504639,
      "step": 279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6301430575549603,
      "epoch": 0.546875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1511426568031311,
      "learning_rate": 3.5315789473684214e-06,
      "loss": 0.0,
      "num_tokens": 1314368.0,
      "reward": 0.22830000519752502,
      "reward_std": 0.26377108693122864,
      "rewards/reward_func/mean": 0.22830000519752502,
      "rewards/reward_func/std": 0.4228929877281189,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6302942372858524,
      "epoch": 0.548828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2503369152545929,
      "learning_rate": 3.5263157894736846e-06,
      "loss": 0.0,
      "num_tokens": 1319120.0,
      "reward": 0.6139749884605408,
      "reward_std": 0.4990615248680115,
      "rewards/reward_func/mean": 0.6139749884605408,
      "rewards/reward_func/std": 0.48867690563201904,
      "step": 281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5613374076783657,
      "epoch": 0.55078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21636393666267395,
      "learning_rate": 3.5210526315789477e-06,
      "loss": 0.0,
      "num_tokens": 1323984.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5965826436877251,
      "epoch": 0.552734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22059328854084015,
      "learning_rate": 3.515789473684211e-06,
      "loss": -0.0,
      "num_tokens": 1328328.0,
      "reward": 0.7463124990463257,
      "reward_std": 0.29738226532936096,
      "rewards/reward_func/mean": 0.7463124990463257,
      "rewards/reward_func/std": 0.45701852440834045,
      "step": 283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6045653857290745,
      "epoch": 0.5546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2106325626373291,
      "learning_rate": 3.510526315789474e-06,
      "loss": 0.0,
      "num_tokens": 1332984.0,
      "reward": 0.48653751611709595,
      "reward_std": 0.48146504163742065,
      "rewards/reward_func/mean": 0.48653751611709595,
      "rewards/reward_func/std": 0.5103995203971863,
      "step": 284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7015610095113516,
      "epoch": 0.556640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16994379460811615,
      "learning_rate": 3.505263157894737e-06,
      "loss": 0.0,
      "num_tokens": 1337928.0,
      "reward": 0.12349999696016312,
      "reward_std": 0.23874559998512268,
      "rewards/reward_func/mean": 0.12350000441074371,
      "rewards/reward_func/std": 0.31001752614974976,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6296170633286238,
      "epoch": 0.55859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.216505229473114,
      "learning_rate": 3.5e-06,
      "loss": 0.0,
      "num_tokens": 1342712.0,
      "reward": 0.4699999988079071,
      "reward_std": 0.542709231376648,
      "rewards/reward_func/mean": 0.4699999988079071,
      "rewards/reward_func/std": 0.5024511218070984,
      "step": 286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6994357369840145,
      "epoch": 0.560546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2235638052225113,
      "learning_rate": 3.4947368421052635e-06,
      "loss": 0.0,
      "num_tokens": 1347360.0,
      "reward": 0.4905624985694885,
      "reward_std": 0.5631812810897827,
      "rewards/reward_func/mean": 0.4905625283718109,
      "rewards/reward_func/std": 0.5214440822601318,
      "step": 287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7445014677941799,
      "epoch": 0.5625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21883165836334229,
      "learning_rate": 3.4894736842105263e-06,
      "loss": -0.0,
      "num_tokens": 1352008.0,
      "reward": 0.8697500228881836,
      "reward_std": 0.2605000138282776,
      "rewards/reward_func/mean": 0.8697500228881836,
      "rewards/reward_func/std": 0.3517392575740814,
      "step": 288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.48259856551885605,
      "epoch": 0.564453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20782361924648285,
      "learning_rate": 3.48421052631579e-06,
      "loss": -0.0,
      "num_tokens": 1356928.0,
      "reward": 0.3723375201225281,
      "reward_std": 0.5098629593849182,
      "rewards/reward_func/mean": 0.3723375201225281,
      "rewards/reward_func/std": 0.4901200532913208,
      "step": 289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6990231722593307,
      "epoch": 0.56640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22484789788722992,
      "learning_rate": 3.478947368421053e-06,
      "loss": 0.0,
      "num_tokens": 1361680.0,
      "reward": 0.12812499701976776,
      "reward_std": 0.2562499940395355,
      "rewards/reward_func/mean": 0.12812499701976776,
      "rewards/reward_func/std": 0.3523993194103241,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5128569137305021,
      "epoch": 0.568359375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14201033115386963,
      "learning_rate": 3.473684210526316e-06,
      "loss": -0.0,
      "num_tokens": 1366648.0,
      "reward": 0.11256249994039536,
      "reward_std": 0.22097797691822052,
      "rewards/reward_func/mean": 0.11256249994039536,
      "rewards/reward_func/std": 0.31335464119911194,
      "step": 291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7180933952331543,
      "epoch": 0.5703125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.2024068832397461,
      "learning_rate": 3.4684210526315794e-06,
      "loss": 0.0,
      "num_tokens": 1371400.0,
      "reward": 0.8334375023841858,
      "reward_std": 0.21312500536441803,
      "rewards/reward_func/mean": 0.8334375023841858,
      "rewards/reward_func/std": 0.30140429735183716,
      "step": 292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5293250977993011,
      "epoch": 0.572265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24872228503227234,
      "learning_rate": 3.463157894736842e-06,
      "loss": 0.0,
      "num_tokens": 1376624.0,
      "reward": 0.5025625228881836,
      "reward_std": 0.4810871481895447,
      "rewards/reward_func/mean": 0.5025625228881836,
      "rewards/reward_func/std": 0.5211412906646729,
      "step": 293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5247739069163799,
      "epoch": 0.57421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25173091888427734,
      "learning_rate": 3.4578947368421057e-06,
      "loss": 0.0,
      "num_tokens": 1381472.0,
      "reward": 0.5079500079154968,
      "reward_std": 0.5440911054611206,
      "rewards/reward_func/mean": 0.5079500079154968,
      "rewards/reward_func/std": 0.5037304759025574,
      "step": 294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.59282411262393,
      "epoch": 0.576171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21082493662834167,
      "learning_rate": 3.4526315789473684e-06,
      "loss": -0.0,
      "num_tokens": 1386312.0,
      "reward": 0.6282625198364258,
      "reward_std": 0.5091252326965332,
      "rewards/reward_func/mean": 0.6282625198364258,
      "rewards/reward_func/std": 0.489662766456604,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6454604081809521,
      "epoch": 0.578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23169437050819397,
      "learning_rate": 3.447368421052632e-06,
      "loss": 0.0,
      "num_tokens": 1390640.0,
      "reward": 0.8723000288009644,
      "reward_std": 0.25540000200271606,
      "rewards/reward_func/mean": 0.8723000288009644,
      "rewards/reward_func/std": 0.3525434732437134,
      "step": 296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.521615531295538,
      "epoch": 0.580078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23611055314540863,
      "learning_rate": 3.4421052631578947e-06,
      "loss": 0.0,
      "num_tokens": 1395296.0,
      "reward": 0.24806250631809235,
      "reward_std": 0.4919757843017578,
      "rewards/reward_func/mean": 0.24806249141693115,
      "rewards/reward_func/std": 0.4555468261241913,
      "step": 297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7950318790972233,
      "epoch": 0.58203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30078282952308655,
      "learning_rate": 3.4368421052631583e-06,
      "loss": -0.0,
      "num_tokens": 1399824.0,
      "reward": 0.5103750228881836,
      "reward_std": 0.5537556409835815,
      "rewards/reward_func/mean": 0.5103750228881836,
      "rewards/reward_func/std": 0.5127608180046082,
      "step": 298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5492544081062078,
      "epoch": 0.583984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2188585102558136,
      "learning_rate": 3.4315789473684215e-06,
      "loss": 0.0,
      "num_tokens": 1404600.0,
      "reward": 0.25306248664855957,
      "reward_std": 0.2876826226711273,
      "rewards/reward_func/mean": 0.25306248664855957,
      "rewards/reward_func/std": 0.4343123733997345,
      "step": 299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9911265224218369,
      "epoch": 0.5859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27803367376327515,
      "learning_rate": 3.4263157894736842e-06,
      "loss": 0.0,
      "num_tokens": 1409072.0,
      "reward": 0.6269875168800354,
      "reward_std": 0.5301945805549622,
      "rewards/reward_func/mean": 0.6269875168800354,
      "rewards/reward_func/std": 0.5088964104652405,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5954888928681612,
      "epoch": 0.587890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20479944348335266,
      "learning_rate": 3.421052631578948e-06,
      "loss": -0.0,
      "num_tokens": 1414032.0,
      "reward": 0.23137500882148743,
      "reward_std": 0.43826958537101746,
      "rewards/reward_func/mean": 0.23137500882148743,
      "rewards/reward_func/std": 0.40582817792892456,
      "step": 301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.521581256762147,
      "epoch": 0.58984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2192363440990448,
      "learning_rate": 3.4157894736842106e-06,
      "loss": 0.0,
      "num_tokens": 1418824.0,
      "reward": 0.36943748593330383,
      "reward_std": 0.5057978630065918,
      "rewards/reward_func/mean": 0.36943748593330383,
      "rewards/reward_func/std": 0.48606905341148376,
      "step": 302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.41881873086094856,
      "epoch": 0.591796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19346477091312408,
      "learning_rate": 3.410526315789474e-06,
      "loss": 0.0,
      "num_tokens": 1423616.0,
      "reward": 0.47993749380111694,
      "reward_std": 0.4731535315513611,
      "rewards/reward_func/mean": 0.47993749380111694,
      "rewards/reward_func/std": 0.5031650066375732,
      "step": 303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.48736816458404064,
      "epoch": 0.59375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1667778193950653,
      "learning_rate": 3.405263157894737e-06,
      "loss": 0.0,
      "num_tokens": 1428288.0,
      "reward": 0.6197500228881836,
      "reward_std": 0.2395000010728836,
      "rewards/reward_func/mean": 0.6197500228881836,
      "rewards/reward_func/std": 0.5133981704711914,
      "step": 304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8001267649233341,
      "epoch": 0.595703125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1438862383365631,
      "learning_rate": 3.4000000000000005e-06,
      "loss": -0.0,
      "num_tokens": 1432912.0,
      "reward": 0.8511874675750732,
      "reward_std": 0.23108144104480743,
      "rewards/reward_func/mean": 0.8511874675750732,
      "rewards/reward_func/std": 0.3289700150489807,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7933619171380997,
      "epoch": 0.59765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22013674676418304,
      "learning_rate": 3.3947368421052636e-06,
      "loss": 0.0,
      "num_tokens": 1437576.0,
      "reward": 0.37968748807907104,
      "reward_std": 0.5357083082199097,
      "rewards/reward_func/mean": 0.37968748807907104,
      "rewards/reward_func/std": 0.5138239860534668,
      "step": 306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9326525256037712,
      "epoch": 0.599609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2728516757488251,
      "learning_rate": 3.3894736842105264e-06,
      "loss": -0.0,
      "num_tokens": 1442096.0,
      "reward": 0.511787474155426,
      "reward_std": 0.009360386058688164,
      "rewards/reward_func/mean": 0.5117875337600708,
      "rewards/reward_func/std": 0.5104570984840393,
      "step": 307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.867360845208168,
      "epoch": 0.6015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2490600049495697,
      "learning_rate": 3.38421052631579e-06,
      "loss": 0.0,
      "num_tokens": 1446840.0,
      "reward": 0.49105000495910645,
      "reward_std": 0.4941999316215515,
      "rewards/reward_func/mean": 0.49105000495910645,
      "rewards/reward_func/std": 0.5251454710960388,
      "step": 308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4640714302659035,
      "epoch": 0.603515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20176871120929718,
      "learning_rate": 3.3789473684210527e-06,
      "loss": -0.0,
      "num_tokens": 1451328.0,
      "reward": 0.5872499942779541,
      "reward_std": 0.4882524013519287,
      "rewards/reward_func/mean": 0.5872499942779541,
      "rewards/reward_func/std": 0.4727397859096527,
      "step": 309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4609074145555496,
      "epoch": 0.60546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19601713120937347,
      "learning_rate": 3.3736842105263163e-06,
      "loss": 0.0,
      "num_tokens": 1456128.0,
      "reward": 0.4829750061035156,
      "reward_std": 0.4673447012901306,
      "rewards/reward_func/mean": 0.4829750061035156,
      "rewards/reward_func/std": 0.4967931807041168,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7879729568958282,
      "epoch": 0.607421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2218167930841446,
      "learning_rate": 3.368421052631579e-06,
      "loss": 0.0,
      "num_tokens": 1460920.0,
      "reward": 0.9754500389099121,
      "reward_std": 0.02152196317911148,
      "rewards/reward_func/mean": 0.9754499793052673,
      "rewards/reward_func/std": 0.020223822444677353,
      "step": 311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4728757180273533,
      "epoch": 0.609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21725128591060638,
      "learning_rate": 3.3631578947368426e-06,
      "loss": -0.0,
      "num_tokens": 1465560.0,
      "reward": 0.37002500891685486,
      "reward_std": 0.25100889801979065,
      "rewards/reward_func/mean": 0.37002497911453247,
      "rewards/reward_func/std": 0.5039855241775513,
      "step": 312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5092521663755178,
      "epoch": 0.611328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19549833238124847,
      "learning_rate": 3.3578947368421054e-06,
      "loss": 0.0,
      "num_tokens": 1470360.0,
      "reward": 0.47251251339912415,
      "reward_std": 0.5420471429824829,
      "rewards/reward_func/mean": 0.47251251339912415,
      "rewards/reward_func/std": 0.5018724799156189,
      "step": 313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7892782241106033,
      "epoch": 0.61328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20750151574611664,
      "learning_rate": 3.3526315789473685e-06,
      "loss": 0.0,
      "num_tokens": 1474848.0,
      "reward": 0.628125011920929,
      "reward_std": 0.5351123809814453,
      "rewards/reward_func/mean": 0.628125011920929,
      "rewards/reward_func/std": 0.5132942199707031,
      "step": 314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8311872594058514,
      "epoch": 0.615234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2424483448266983,
      "learning_rate": 3.347368421052632e-06,
      "loss": 0.0,
      "num_tokens": 1479344.0,
      "reward": 0.7445999979972839,
      "reward_std": 0.49650442600250244,
      "rewards/reward_func/mean": 0.7445999979972839,
      "rewards/reward_func/std": 0.45967379212379456,
      "step": 315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6033541187644005,
      "epoch": 0.6171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24076202511787415,
      "learning_rate": 3.342105263157895e-06,
      "loss": 0.0,
      "num_tokens": 1484192.0,
      "reward": 0.350600004196167,
      "reward_std": 0.5059750080108643,
      "rewards/reward_func/mean": 0.350600004196167,
      "rewards/reward_func/std": 0.48401686549186707,
      "step": 316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7708289846777916,
      "epoch": 0.619140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24534860253334045,
      "learning_rate": 3.3368421052631584e-06,
      "loss": 0.0,
      "num_tokens": 1489200.0,
      "reward": 0.350600004196167,
      "reward_std": 0.4990043640136719,
      "rewards/reward_func/mean": 0.350600004196167,
      "rewards/reward_func/std": 0.4842973053455353,
      "step": 317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7339390330016613,
      "epoch": 0.62109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24659650027751923,
      "learning_rate": 3.331578947368421e-06,
      "loss": -0.0,
      "num_tokens": 1493688.0,
      "reward": 0.518750011920929,
      "reward_std": 0.4794538617134094,
      "rewards/reward_func/mean": 0.5187499523162842,
      "rewards/reward_func/std": 0.514738142490387,
      "step": 318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8104076609015465,
      "epoch": 0.623046875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17738322913646698,
      "learning_rate": 3.3263157894736848e-06,
      "loss": 0.0,
      "num_tokens": 1498304.0,
      "reward": 0.7335000038146973,
      "reward_std": 0.28232428431510925,
      "rewards/reward_func/mean": 0.7335000038146973,
      "rewards/reward_func/std": 0.4527260363101959,
      "step": 319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3726614834740758,
      "epoch": 0.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17689530551433563,
      "learning_rate": 3.3210526315789475e-06,
      "loss": 0.0,
      "num_tokens": 1502784.0,
      "reward": 0.8113000392913818,
      "reward_std": 0.23659999668598175,
      "rewards/reward_func/mean": 0.8113000392913818,
      "rewards/reward_func/std": 0.32786741852760315,
      "step": 320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8716227933764458,
      "epoch": 0.626953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25260844826698303,
      "learning_rate": 3.3157894736842107e-06,
      "loss": 0.0,
      "num_tokens": 1507304.0,
      "reward": 0.9945999979972839,
      "reward_std": 0.010800004005432129,
      "rewards/reward_func/mean": 0.9945999979972839,
      "rewards/reward_func/std": 0.009998860768973827,
      "step": 321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5032209325581789,
      "epoch": 0.62890625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11165645718574524,
      "learning_rate": 3.310526315789474e-06,
      "loss": 0.0,
      "num_tokens": 1512264.0,
      "reward": 0.1237500011920929,
      "reward_std": 0.23090852797031403,
      "rewards/reward_func/mean": 0.1237500011920929,
      "rewards/reward_func/std": 0.3300081193447113,
      "step": 322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6931778788566589,
      "epoch": 0.630859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22502507269382477,
      "learning_rate": 3.305263157894737e-06,
      "loss": -0.0,
      "num_tokens": 1517064.0,
      "reward": 0.35860002040863037,
      "reward_std": 0.5156620740890503,
      "rewards/reward_func/mean": 0.358599990606308,
      "rewards/reward_func/std": 0.49546343088150024,
      "step": 323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5616287402808666,
      "epoch": 0.6328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22584268450737,
      "learning_rate": 3.3000000000000006e-06,
      "loss": 0.0,
      "num_tokens": 1522032.0,
      "reward": 0.2150000035762787,
      "reward_std": 0.4300000071525574,
      "rewards/reward_func/mean": 0.2150000035762787,
      "rewards/reward_func/std": 0.39838388562202454,
      "step": 324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6828614771366119,
      "epoch": 0.634765625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1657702773809433,
      "learning_rate": 3.2947368421052633e-06,
      "loss": 0.0,
      "num_tokens": 1526888.0,
      "reward": 0.878125011920929,
      "reward_std": 0.24375002086162567,
      "rewards/reward_func/mean": 0.878125011920929,
      "rewards/reward_func/std": 0.34471458196640015,
      "step": 325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4955676291137934,
      "epoch": 0.63671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23417575657367706,
      "learning_rate": 3.289473684210527e-06,
      "loss": -0.0,
      "num_tokens": 1531496.0,
      "reward": 0.7447500228881836,
      "reward_std": 0.2991751432418823,
      "rewards/reward_func/mean": 0.7447500228881836,
      "rewards/reward_func/std": 0.4598980247974396,
      "step": 326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8880259320139885,
      "epoch": 0.638671875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16445371508598328,
      "learning_rate": 3.2842105263157897e-06,
      "loss": 0.0,
      "num_tokens": 1535984.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7877190988510847,
      "epoch": 0.640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24739286303520203,
      "learning_rate": 3.278947368421053e-06,
      "loss": 0.0,
      "num_tokens": 1540744.0,
      "reward": 0.6195374727249146,
      "reward_std": 0.5129984021186829,
      "rewards/reward_func/mean": 0.6195374727249146,
      "rewards/reward_func/std": 0.48989206552505493,
      "step": 328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7118626944720745,
      "epoch": 0.642578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21109792590141296,
      "learning_rate": 3.273684210526316e-06,
      "loss": 0.0,
      "num_tokens": 1545728.0,
      "reward": 0.3330000042915344,
      "reward_std": 0.47834351658821106,
      "rewards/reward_func/mean": 0.3330000042915344,
      "rewards/reward_func/std": 0.45958366990089417,
      "step": 329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7776865549385548,
      "epoch": 0.64453125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 3.268421052631579e-06,
      "loss": 0.0,
      "num_tokens": 1550560.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8135379999876022,
      "epoch": 0.646484375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17627958953380585,
      "learning_rate": 3.2631578947368423e-06,
      "loss": 0.0,
      "num_tokens": 1555040.0,
      "reward": 0.878125011920929,
      "reward_std": 0.24375002086162567,
      "rewards/reward_func/mean": 0.878125011920929,
      "rewards/reward_func/std": 0.34471458196640015,
      "step": 331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6435816325247288,
      "epoch": 0.6484375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17974931001663208,
      "learning_rate": 3.2578947368421055e-06,
      "loss": 0.0,
      "num_tokens": 1559904.0,
      "reward": 0.7515624761581421,
      "reward_std": 0.2868822515010834,
      "rewards/reward_func/mean": 0.7515624761581421,
      "rewards/reward_func/std": 0.4600290060043335,
      "step": 332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.39286974538117647,
      "epoch": 0.650390625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 3.252631578947369e-06,
      "loss": 0.0,
      "num_tokens": 1564416.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6735682226717472,
      "epoch": 0.65234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20429600775241852,
      "learning_rate": 3.247368421052632e-06,
      "loss": 0.0,
      "num_tokens": 1569208.0,
      "reward": 0.6209875345230103,
      "reward_std": 0.4939994215965271,
      "rewards/reward_func/mean": 0.6209875345230103,
      "rewards/reward_func/std": 0.4771609306335449,
      "step": 334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6759085133671761,
      "epoch": 0.654296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2895135283470154,
      "learning_rate": 3.2421052631578945e-06,
      "loss": 0.0,
      "num_tokens": 1573728.0,
      "reward": 0.7593749761581421,
      "reward_std": 0.48125001788139343,
      "rewards/reward_func/mean": 0.7593749761581421,
      "rewards/reward_func/std": 0.44560104608535767,
      "step": 335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.45824577659368515,
      "epoch": 0.65625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22657960653305054,
      "learning_rate": 3.236842105263158e-06,
      "loss": -0.0,
      "num_tokens": 1578240.0,
      "reward": 0.6972000002861023,
      "reward_std": 0.46480000019073486,
      "rewards/reward_func/mean": 0.6972000002861023,
      "rewards/reward_func/std": 0.43032118678092957,
      "step": 336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7448481060564518,
      "epoch": 0.658203125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17294245958328247,
      "learning_rate": 3.2315789473684213e-06,
      "loss": -0.0,
      "num_tokens": 1583088.0,
      "reward": 0.11026249825954437,
      "reward_std": 0.21637839078903198,
      "rewards/reward_func/mean": 0.11026249825954437,
      "rewards/reward_func/std": 0.3068498373031616,
      "step": 337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.48510024510324,
      "epoch": 0.66015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20591919124126434,
      "learning_rate": 3.2263157894736845e-06,
      "loss": 0.0,
      "num_tokens": 1587872.0,
      "reward": 0.484250009059906,
      "reward_std": 0.5447538495063782,
      "rewards/reward_func/mean": 0.4842499792575836,
      "rewards/reward_func/std": 0.5043662190437317,
      "step": 338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49838624335825443,
      "epoch": 0.662109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18662779033184052,
      "learning_rate": 3.2210526315789476e-06,
      "loss": 0.0,
      "num_tokens": 1592672.0,
      "reward": 0.8531500101089478,
      "reward_std": 0.2557721734046936,
      "rewards/reward_func/mean": 0.8531500101089478,
      "rewards/reward_func/std": 0.3453153371810913,
      "step": 339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7124339491128922,
      "epoch": 0.6640625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.23854242265224457,
      "learning_rate": 3.215789473684211e-06,
      "loss": -0.0,
      "num_tokens": 1597416.0,
      "reward": 0.8318749666213989,
      "reward_std": 0.2162500023841858,
      "rewards/reward_func/mean": 0.8318749666213989,
      "rewards/reward_func/std": 0.3058236837387085,
      "step": 340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8913617357611656,
      "epoch": 0.666015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28833919763565063,
      "learning_rate": 3.210526315789474e-06,
      "loss": 0.0,
      "num_tokens": 1601880.0,
      "reward": 0.8769874572753906,
      "reward_std": 0.2460249960422516,
      "rewards/reward_func/mean": 0.8769875168800354,
      "rewards/reward_func/std": 0.3392883837223053,
      "step": 341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.48491984978318214,
      "epoch": 0.66796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2013910710811615,
      "learning_rate": 3.2052631578947367e-06,
      "loss": -0.0,
      "num_tokens": 1606664.0,
      "reward": 0.6037999987602234,
      "reward_std": 0.5043159127235413,
      "rewards/reward_func/mean": 0.6037999391555786,
      "rewards/reward_func/std": 0.4866776764392853,
      "step": 342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.837503544986248,
      "epoch": 0.669921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2542946934700012,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 0.0,
      "num_tokens": 1611408.0,
      "reward": 0.7344000339508057,
      "reward_std": 0.48996883630752563,
      "rewards/reward_func/mean": 0.7344000339508057,
      "rewards/reward_func/std": 0.45366016030311584,
      "step": 343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7475596591830254,
      "epoch": 0.671875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15543855726718903,
      "learning_rate": 3.1947368421052634e-06,
      "loss": 0.0,
      "num_tokens": 1615896.0,
      "reward": 0.7593749761581421,
      "reward_std": 0.27784982323646545,
      "rewards/reward_func/mean": 0.7593749761581421,
      "rewards/reward_func/std": 0.44555091857910156,
      "step": 344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7178698666393757,
      "epoch": 0.673828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21593737602233887,
      "learning_rate": 3.1894736842105266e-06,
      "loss": 0.0,
      "num_tokens": 1620656.0,
      "reward": 0.6245999932289124,
      "reward_std": 0.25092828273773193,
      "rewards/reward_func/mean": 0.6245999932289124,
      "rewards/reward_func/std": 0.4903907775878906,
      "step": 345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7785428650677204,
      "epoch": 0.67578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23444195091724396,
      "learning_rate": 3.1842105263157898e-06,
      "loss": -0.0,
      "num_tokens": 1625144.0,
      "reward": 0.7519874572753906,
      "reward_std": 0.48887720704078674,
      "rewards/reward_func/mean": 0.7519874572753906,
      "rewards/reward_func/std": 0.4527363181114197,
      "step": 346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6989172957837582,
      "epoch": 0.677734375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 3.178947368421053e-06,
      "loss": 0.0,
      "num_tokens": 1630144.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7183702476322651,
      "epoch": 0.6796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22046923637390137,
      "learning_rate": 3.173684210526316e-06,
      "loss": 0.0,
      "num_tokens": 1634800.0,
      "reward": 0.4881250262260437,
      "reward_std": 0.5570073127746582,
      "rewards/reward_func/mean": 0.4881250262260437,
      "rewards/reward_func/std": 0.5157098770141602,
      "step": 348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7108337618410587,
      "epoch": 0.681640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22072257101535797,
      "learning_rate": 3.168421052631579e-06,
      "loss": 0.0,
      "num_tokens": 1639416.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6336832232773304,
      "epoch": 0.68359375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16415120661258698,
      "learning_rate": 3.1631578947368424e-06,
      "loss": 0.0,
      "num_tokens": 1644272.0,
      "reward": 0.75,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6293626204133034,
      "epoch": 0.685546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23304316401481628,
      "learning_rate": 3.157894736842105e-06,
      "loss": -0.0,
      "num_tokens": 1649232.0,
      "reward": 0.12349999696016312,
      "reward_std": 0.2251299023628235,
      "rewards/reward_func/mean": 0.12349999696016312,
      "rewards/reward_func/std": 0.3091523051261902,
      "step": 351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6902714185416698,
      "epoch": 0.6875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25497615337371826,
      "learning_rate": 3.1526315789473688e-06,
      "loss": -0.0,
      "num_tokens": 1653984.0,
      "reward": 0.8506499528884888,
      "reward_std": 0.2553556561470032,
      "rewards/reward_func/mean": 0.8506499528884888,
      "rewards/reward_func/std": 0.3441561758518219,
      "step": 352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6613344326615334,
      "epoch": 0.689453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22403497993946075,
      "learning_rate": 3.147368421052632e-06,
      "loss": -0.0,
      "num_tokens": 1658976.0,
      "reward": 0.12974999845027924,
      "reward_std": 0.2284284085035324,
      "rewards/reward_func/mean": 0.12974999845027924,
      "rewards/reward_func/std": 0.30694079399108887,
      "step": 353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5704779382795095,
      "epoch": 0.69140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2441885620355606,
      "learning_rate": 3.142105263157895e-06,
      "loss": -0.0,
      "num_tokens": 1663760.0,
      "reward": 0.004687500186264515,
      "reward_std": 0.006733439397066832,
      "rewards/reward_func/mean": 0.004687500186264515,
      "rewards/reward_func/std": 0.006469365209341049,
      "step": 354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5089486921206117,
      "epoch": 0.693359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2328837364912033,
      "learning_rate": 3.1368421052631582e-06,
      "loss": 0.0,
      "num_tokens": 1668544.0,
      "reward": 0.3531000018119812,
      "reward_std": 0.5088821649551392,
      "rewards/reward_func/mean": 0.3531000018119812,
      "rewards/reward_func/std": 0.487506628036499,
      "step": 355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.712410818785429,
      "epoch": 0.6953125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14800521731376648,
      "learning_rate": 3.131578947368421e-06,
      "loss": -0.0,
      "num_tokens": 1673408.0,
      "reward": 0.625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6078695468604565,
      "epoch": 0.697265625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.21499896049499512,
      "learning_rate": 3.1263157894736846e-06,
      "loss": 0.0,
      "num_tokens": 1677888.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7318508177995682,
      "epoch": 0.69921875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18809860944747925,
      "learning_rate": 3.1210526315789473e-06,
      "loss": 0.0,
      "num_tokens": 1682352.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5955179370939732,
      "epoch": 0.701171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19998441636562347,
      "learning_rate": 3.115789473684211e-06,
      "loss": -0.0,
      "num_tokens": 1687200.0,
      "reward": 0.6271250247955322,
      "reward_std": 0.5026910901069641,
      "rewards/reward_func/mean": 0.6271250247955322,
      "rewards/reward_func/std": 0.4851891100406647,
      "step": 359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4728749534115195,
      "epoch": 0.703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20382100343704224,
      "learning_rate": 3.1105263157894736e-06,
      "loss": 0.0,
      "num_tokens": 1691696.0,
      "reward": 0.8113000392913818,
      "reward_std": 0.23659999668598175,
      "rewards/reward_func/mean": 0.8113000392913818,
      "rewards/reward_func/std": 0.32786741852760315,
      "step": 360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6648149769753218,
      "epoch": 0.705078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22265951335430145,
      "learning_rate": 3.1052631578947372e-06,
      "loss": -0.0,
      "num_tokens": 1696360.0,
      "reward": 0.6027500033378601,
      "reward_std": 0.5180938243865967,
      "rewards/reward_func/mean": 0.6027500033378601,
      "rewards/reward_func/std": 0.5003756880760193,
      "step": 361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5913132168352604,
      "epoch": 0.70703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21587161719799042,
      "learning_rate": 3.1000000000000004e-06,
      "loss": 0.0,
      "num_tokens": 1701144.0,
      "reward": 0.7343375086784363,
      "reward_std": 0.4605855941772461,
      "rewards/reward_func/mean": 0.7343375086784363,
      "rewards/reward_func/std": 0.42718806862831116,
      "step": 362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4091036804020405,
      "epoch": 0.708984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17123205959796906,
      "learning_rate": 3.094736842105263e-06,
      "loss": -0.0,
      "num_tokens": 1705640.0,
      "reward": 0.5841249823570251,
      "reward_std": 0.4971931576728821,
      "rewards/reward_func/mean": 0.5841249823570251,
      "rewards/reward_func/std": 0.47686323523521423,
      "step": 363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6606861557811499,
      "epoch": 0.7109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23936234414577484,
      "learning_rate": 3.0894736842105267e-06,
      "loss": -0.0,
      "num_tokens": 1710288.0,
      "reward": 0.504687488079071,
      "reward_std": 0.5720410346984863,
      "rewards/reward_func/mean": 0.504687488079071,
      "rewards/reward_func/std": 0.5296536087989807,
      "step": 364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5874370019882917,
      "epoch": 0.712890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23465895652770996,
      "learning_rate": 3.0842105263157895e-06,
      "loss": -0.0,
      "num_tokens": 1714944.0,
      "reward": 0.6230999827384949,
      "reward_std": 0.519683837890625,
      "rewards/reward_func/mean": 0.6230999827384949,
      "rewards/reward_func/std": 0.5026271939277649,
      "step": 365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8287493027746677,
      "epoch": 0.71484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24288125336170197,
      "learning_rate": 3.078947368421053e-06,
      "loss": -0.0,
      "num_tokens": 1719696.0,
      "reward": 0.6187999844551086,
      "reward_std": 0.5346147418022156,
      "rewards/reward_func/mean": 0.6187999844551086,
      "rewards/reward_func/std": 0.5125207901000977,
      "step": 366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5209563355892897,
      "epoch": 0.716796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23250123858451843,
      "learning_rate": 3.0736842105263158e-06,
      "loss": 0.0,
      "num_tokens": 1724664.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/reward_func/mean": 0.0062500000931322575,
      "rewards/reward_func/std": 0.011572751216590405,
      "step": 367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6145099401473999,
      "epoch": 0.71875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22183161973953247,
      "learning_rate": 3.0684210526315794e-06,
      "loss": -0.0,
      "num_tokens": 1729424.0,
      "reward": 0.5003125071525574,
      "reward_std": 0.5600941181182861,
      "rewards/reward_func/mean": 0.5003125071525574,
      "rewards/reward_func/std": 0.5186700820922852,
      "step": 368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8084001671522856,
      "epoch": 0.720703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2436615526676178,
      "learning_rate": 3.0631578947368425e-06,
      "loss": 0.0,
      "num_tokens": 1734176.0,
      "reward": 0.4699999988079071,
      "reward_std": 0.542709231376648,
      "rewards/reward_func/mean": 0.4699999988079071,
      "rewards/reward_func/std": 0.5024511218070984,
      "step": 369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6762276217341423,
      "epoch": 0.72265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23068293929100037,
      "learning_rate": 3.0578947368421053e-06,
      "loss": 0.0,
      "num_tokens": 1738800.0,
      "reward": 0.6190624833106995,
      "reward_std": 0.5181007981300354,
      "rewards/reward_func/mean": 0.6190624833106995,
      "rewards/reward_func/std": 0.4957561790943146,
      "step": 370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6614982299506664,
      "epoch": 0.724609375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14307619631290436,
      "learning_rate": 3.052631578947369e-06,
      "loss": 0.0,
      "num_tokens": 1743480.0,
      "reward": 0.75,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.559177003800869,
      "epoch": 0.7265625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15780669450759888,
      "learning_rate": 3.0473684210526316e-06,
      "loss": 0.0,
      "num_tokens": 1747824.0,
      "reward": 0.75,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6498337648808956,
      "epoch": 0.728515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21541620790958405,
      "learning_rate": 3.042105263157895e-06,
      "loss": -0.0,
      "num_tokens": 1752608.0,
      "reward": 0.24124999344348907,
      "reward_std": 0.4659823775291443,
      "rewards/reward_func/mean": 0.24124999344348907,
      "rewards/reward_func/std": 0.4314158856868744,
      "step": 373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5173940751701593,
      "epoch": 0.73046875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14444416761398315,
      "learning_rate": 3.036842105263158e-06,
      "loss": -0.0,
      "num_tokens": 1757576.0,
      "reward": 0.0015625000232830644,
      "reward_std": 0.0031250000465661287,
      "rewards/reward_func/mean": 0.0015625000232830644,
      "rewards/reward_func/std": 0.0044194175861775875,
      "step": 374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5764465928077698,
      "epoch": 0.732421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1886185109615326,
      "learning_rate": 3.0315789473684215e-06,
      "loss": -0.0,
      "num_tokens": 1762504.0,
      "reward": 0.2711625099182129,
      "reward_std": 0.29757219552993774,
      "rewards/reward_func/mean": 0.2711625099182129,
      "rewards/reward_func/std": 0.437508225440979,
      "step": 375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8418979570269585,
      "epoch": 0.734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22740621864795685,
      "learning_rate": 3.0263157894736843e-06,
      "loss": 0.0,
      "num_tokens": 1767144.0,
      "reward": 0.35471248626708984,
      "reward_std": 0.49858343601226807,
      "rewards/reward_func/mean": 0.35471248626708984,
      "rewards/reward_func/std": 0.4733320474624634,
      "step": 376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9156858287751675,
      "epoch": 0.736328125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.2001858800649643,
      "learning_rate": 3.0210526315789474e-06,
      "loss": 0.0,
      "num_tokens": 1771736.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6596429236233234,
      "epoch": 0.73828125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1892218291759491,
      "learning_rate": 3.015789473684211e-06,
      "loss": 0.0,
      "num_tokens": 1776168.0,
      "reward": 0.7515624761581421,
      "reward_std": 0.2868822515010834,
      "rewards/reward_func/mean": 0.7515624761581421,
      "rewards/reward_func/std": 0.4600290060043335,
      "step": 378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5691553540527821,
      "epoch": 0.740234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20194444060325623,
      "learning_rate": 3.0105263157894737e-06,
      "loss": -0.0,
      "num_tokens": 1780904.0,
      "reward": 0.8247499465942383,
      "reward_std": 0.2395000010728836,
      "rewards/reward_func/mean": 0.8247500061988831,
      "rewards/reward_func/std": 0.3333088457584381,
      "step": 379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 1.0326480194926262,
      "epoch": 0.7421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22807976603507996,
      "learning_rate": 3.0052631578947373e-06,
      "loss": 0.0,
      "num_tokens": 1785640.0,
      "reward": 0.6212124824523926,
      "reward_std": 0.2459665983915329,
      "rewards/reward_func/mean": 0.6212124824523926,
      "rewards/reward_func/std": 0.4976014196872711,
      "step": 380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8465277478098869,
      "epoch": 0.744140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.33321791887283325,
      "learning_rate": 3e-06,
      "loss": 0.0,
      "num_tokens": 1790168.0,
      "reward": 0.8723000288009644,
      "reward_std": 0.25540000200271606,
      "rewards/reward_func/mean": 0.8723000288009644,
      "rewards/reward_func/std": 0.3525434732437134,
      "step": 381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.2553594410419464,
      "epoch": 0.74609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17427082359790802,
      "learning_rate": 2.9947368421052637e-06,
      "loss": 0.0,
      "num_tokens": 1794664.0,
      "reward": 0.5823000073432922,
      "reward_std": 0.5016319751739502,
      "rewards/reward_func/mean": 0.5823000073432922,
      "rewards/reward_func/std": 0.48220303654670715,
      "step": 382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6799057051539421,
      "epoch": 0.748046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2410704791545868,
      "learning_rate": 2.9894736842105264e-06,
      "loss": 0.0,
      "num_tokens": 1799424.0,
      "reward": 0.7316499948501587,
      "reward_std": 0.4879480004310608,
      "rewards/reward_func/mean": 0.7316499948501587,
      "rewards/reward_func/std": 0.45203354954719543,
      "step": 383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47508667781949043,
      "epoch": 0.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2326003611087799,
      "learning_rate": 2.9842105263157896e-06,
      "loss": -0.0,
      "num_tokens": 1804088.0,
      "reward": 0.265625,
      "reward_std": 0.4900358021259308,
      "rewards/reward_func/mean": 0.265625,
      "rewards/reward_func/std": 0.4540861248970032,
      "step": 384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8876021355390549,
      "epoch": 0.751953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24729052186012268,
      "learning_rate": 2.9789473684210527e-06,
      "loss": 0.0,
      "num_tokens": 1808592.0,
      "reward": 0.7473000288009644,
      "reward_std": 0.2940751314163208,
      "rewards/reward_func/mean": 0.7473000288009644,
      "rewards/reward_func/std": 0.4613037705421448,
      "step": 385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.44519135542213917,
      "epoch": 0.75390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18563015758991241,
      "learning_rate": 2.973684210526316e-06,
      "loss": 0.0,
      "num_tokens": 1813552.0,
      "reward": 0.22750000655651093,
      "reward_std": 0.25122225284576416,
      "rewards/reward_func/mean": 0.22749999165534973,
      "rewards/reward_func/std": 0.39082661271095276,
      "step": 386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.740038525313139,
      "epoch": 0.755859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20772859454154968,
      "learning_rate": 2.9684210526315795e-06,
      "loss": -0.0,
      "num_tokens": 1818296.0,
      "reward": 0.6243749856948853,
      "reward_std": 0.2402016967535019,
      "rewards/reward_func/mean": 0.6243749856948853,
      "rewards/reward_func/std": 0.4830171763896942,
      "step": 387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7946259453892708,
      "epoch": 0.7578125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14700205624103546,
      "learning_rate": 2.9631578947368422e-06,
      "loss": 0.0,
      "num_tokens": 1823056.0,
      "reward": 0.9474999904632568,
      "reward_std": 0.01500000525265932,
      "rewards/reward_func/mean": 0.9474999904632568,
      "rewards/reward_func/std": 0.02121320553123951,
      "step": 388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5914437137544155,
      "epoch": 0.759765625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.21744617819786072,
      "learning_rate": 2.957894736842106e-06,
      "loss": 0.0,
      "num_tokens": 1827400.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6300115957856178,
      "epoch": 0.76171875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16123859584331512,
      "learning_rate": 2.9526315789473685e-06,
      "loss": -0.0,
      "num_tokens": 1832408.0,
      "reward": 0.2418999969959259,
      "reward_std": 0.27935683727264404,
      "rewards/reward_func/mean": 0.2418999969959259,
      "rewards/reward_func/std": 0.44794899225234985,
      "step": 390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6575647704303265,
      "epoch": 0.763671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1817922294139862,
      "learning_rate": 2.9473684210526317e-06,
      "loss": -0.0,
      "num_tokens": 1837072.0,
      "reward": 0.3765625059604645,
      "reward_std": 0.5368822813034058,
      "rewards/reward_func/mean": 0.3765625059604645,
      "rewards/reward_func/std": 0.5162726044654846,
      "step": 391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8860741257667542,
      "epoch": 0.765625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 2.942105263157895e-06,
      "loss": 0.0,
      "num_tokens": 1841600.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5626347567886114,
      "epoch": 0.767578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20179854333400726,
      "learning_rate": 2.936842105263158e-06,
      "loss": 0.0,
      "num_tokens": 1846392.0,
      "reward": 0.36006247997283936,
      "reward_std": 0.5131810903549194,
      "rewards/reward_func/mean": 0.36006247997283936,
      "rewards/reward_func/std": 0.493501216173172,
      "step": 393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6695420295000076,
      "epoch": 0.76953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25772982835769653,
      "learning_rate": 2.931578947368421e-06,
      "loss": 0.0,
      "num_tokens": 1851360.0,
      "reward": 0.11412499845027924,
      "reward_std": 0.22824999690055847,
      "rewards/reward_func/mean": 0.11412499845027924,
      "rewards/reward_func/std": 0.3128150701522827,
      "step": 394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6333671249449253,
      "epoch": 0.771484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2417721301317215,
      "learning_rate": 2.9263157894736844e-06,
      "loss": -0.0,
      "num_tokens": 1856336.0,
      "reward": 0.12037499994039536,
      "reward_std": 0.22721248865127563,
      "rewards/reward_func/mean": 0.12037499994039536,
      "rewards/reward_func/std": 0.31041398644447327,
      "step": 395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6140784211456776,
      "epoch": 0.7734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24161596596240997,
      "learning_rate": 2.921052631578948e-06,
      "loss": 0.0,
      "num_tokens": 1861136.0,
      "reward": 0.6094000339508057,
      "reward_std": 0.5234606862068176,
      "rewards/reward_func/mean": 0.6094000339508057,
      "rewards/reward_func/std": 0.5048978924751282,
      "step": 396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.416771961376071,
      "epoch": 0.775390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19176459312438965,
      "learning_rate": 2.9157894736842107e-06,
      "loss": -0.0,
      "num_tokens": 1865976.0,
      "reward": 0.5075249671936035,
      "reward_std": 0.48087674379348755,
      "rewards/reward_func/mean": 0.5075249671936035,
      "rewards/reward_func/std": 0.5095809102058411,
      "step": 397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7132963538169861,
      "epoch": 0.77734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21432211995124817,
      "learning_rate": 2.9105263157894743e-06,
      "loss": 0.0,
      "num_tokens": 1870416.0,
      "reward": 0.8723000288009644,
      "reward_std": 0.25540000200271606,
      "rewards/reward_func/mean": 0.8723000288009644,
      "rewards/reward_func/std": 0.3525434732437134,
      "step": 398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6518449485301971,
      "epoch": 0.779296875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16880500316619873,
      "learning_rate": 2.905263157894737e-06,
      "loss": 0.0,
      "num_tokens": 1875048.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8165195435285568,
      "epoch": 0.78125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15151342749595642,
      "learning_rate": 2.9e-06,
      "loss": 0.0,
      "num_tokens": 1879520.0,
      "reward": 0.878125011920929,
      "reward_std": 0.24375002086162567,
      "rewards/reward_func/mean": 0.878125011920929,
      "rewards/reward_func/std": 0.34471458196640015,
      "step": 400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5633701048791409,
      "epoch": 0.783203125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.181041419506073,
      "learning_rate": 2.8947368421052634e-06,
      "loss": 0.0,
      "num_tokens": 1884368.0,
      "reward": 0.23649999499320984,
      "reward_std": 0.27312225103378296,
      "rewards/reward_func/mean": 0.23649999499320984,
      "rewards/reward_func/std": 0.4379509389400482,
      "step": 401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.32244748808443546,
      "epoch": 0.78515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17149122059345245,
      "learning_rate": 2.8894736842105265e-06,
      "loss": 0.0,
      "num_tokens": 1888872.0,
      "reward": 0.6985000371932983,
      "reward_std": 0.270952433347702,
      "rewards/reward_func/mean": 0.6985000371932983,
      "rewards/reward_func/std": 0.4311384856700897,
      "step": 402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9212587662041187,
      "epoch": 0.787109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3085295259952545,
      "learning_rate": 2.88421052631579e-06,
      "loss": -0.0,
      "num_tokens": 1893352.0,
      "reward": 0.6285499930381775,
      "reward_std": 0.5285625457763672,
      "rewards/reward_func/mean": 0.6285499930381775,
      "rewards/reward_func/std": 0.5069750547409058,
      "step": 403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7726543471217155,
      "epoch": 0.7890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23899753391742706,
      "learning_rate": 2.878947368421053e-06,
      "loss": -0.0,
      "num_tokens": 1898088.0,
      "reward": 0.8645749688148499,
      "reward_std": 0.25090357661247253,
      "rewards/reward_func/mean": 0.8645749688148499,
      "rewards/reward_func/std": 0.3395808935165405,
      "step": 404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5518805952742696,
      "epoch": 0.791015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20844319462776184,
      "learning_rate": 2.8736842105263164e-06,
      "loss": -0.0,
      "num_tokens": 1903064.0,
      "reward": 0.2329374998807907,
      "reward_std": 0.2587106227874756,
      "rewards/reward_func/mean": 0.2329374998807907,
      "rewards/reward_func/std": 0.40477776527404785,
      "step": 405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.27654310688376427,
      "epoch": 0.79296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16797520220279694,
      "learning_rate": 2.868421052631579e-06,
      "loss": 0.0,
      "num_tokens": 1907568.0,
      "reward": 0.6930000185966492,
      "reward_std": 0.27320215106010437,
      "rewards/reward_func/mean": 0.6930000185966492,
      "rewards/reward_func/std": 0.4277917444705963,
      "step": 406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6657938584685326,
      "epoch": 0.794921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26008403301239014,
      "learning_rate": 2.8631578947368423e-06,
      "loss": -0.0,
      "num_tokens": 1912408.0,
      "reward": 0.3424000144004822,
      "reward_std": 0.4964672327041626,
      "rewards/reward_func/mean": 0.3424000144004822,
      "rewards/reward_func/std": 0.47299036383628845,
      "step": 407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5552172213792801,
      "epoch": 0.796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2306947261095047,
      "learning_rate": 2.8578947368421055e-06,
      "loss": 0.0,
      "num_tokens": 1917192.0,
      "reward": 0.3631874918937683,
      "reward_std": 0.5111160278320312,
      "rewards/reward_func/mean": 0.3631874918937683,
      "rewards/reward_func/std": 0.4909226894378662,
      "step": 408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9417452700436115,
      "epoch": 0.798828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26336851716041565,
      "learning_rate": 2.8526315789473687e-06,
      "loss": 0.0,
      "num_tokens": 1921688.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5507835112512112,
      "epoch": 0.80078125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 2.847368421052632e-06,
      "loss": 0.0,
      "num_tokens": 1926336.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8155883587896824,
      "epoch": 0.802734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25022512674331665,
      "learning_rate": 2.842105263157895e-06,
      "loss": -0.0,
      "num_tokens": 1930944.0,
      "reward": 0.9845499992370605,
      "reward_std": 0.025080181658267975,
      "rewards/reward_func/mean": 0.9845499992370605,
      "rewards/reward_func/std": 0.02366715855896473,
      "step": 411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.2771544838324189,
      "epoch": 0.8046875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11498188227415085,
      "learning_rate": 2.8368421052631586e-06,
      "loss": -0.0,
      "num_tokens": 1935456.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5674882866442204,
      "epoch": 0.806640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.222222238779068,
      "learning_rate": 2.8315789473684213e-06,
      "loss": 0.0,
      "num_tokens": 1940104.0,
      "reward": 0.4912000000476837,
      "reward_std": 0.5675593614578247,
      "rewards/reward_func/mean": 0.4912000000476837,
      "rewards/reward_func/std": 0.5256202816963196,
      "step": 413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.884392186999321,
      "epoch": 0.80859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2370668649673462,
      "learning_rate": 2.826315789473684e-06,
      "loss": 0.0,
      "num_tokens": 1944632.0,
      "reward": 0.5062500238418579,
      "reward_std": 0.5703184008598328,
      "rewards/reward_func/mean": 0.5062500238418579,
      "rewards/reward_func/std": 0.5280946493148804,
      "step": 414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5134440064430237,
      "epoch": 0.810546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18468965590000153,
      "learning_rate": 2.8210526315789476e-06,
      "loss": 0.0,
      "num_tokens": 1949600.0,
      "reward": 0.13600000739097595,
      "reward_std": 0.2231663018465042,
      "rewards/reward_func/mean": 0.13600000739097595,
      "rewards/reward_func/std": 0.30529868602752686,
      "step": 415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6705011464655399,
      "epoch": 0.8125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22873243689537048,
      "learning_rate": 2.815789473684211e-06,
      "loss": 0.0,
      "num_tokens": 1954392.0,
      "reward": 0.4972499907016754,
      "reward_std": 0.5742101669311523,
      "rewards/reward_func/mean": 0.4972499907016754,
      "rewards/reward_func/std": 0.5316314101219177,
      "step": 416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9413636922836304,
      "epoch": 0.814453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22522786259651184,
      "learning_rate": 2.810526315789474e-06,
      "loss": 0.0,
      "num_tokens": 1958888.0,
      "reward": 0.6312500238418579,
      "reward_std": 0.5315045118331909,
      "rewards/reward_func/mean": 0.6312500238418579,
      "rewards/reward_func/std": 0.5090256929397583,
      "step": 417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7534596920013428,
      "epoch": 0.81640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2349129617214203,
      "learning_rate": 2.805263157894737e-06,
      "loss": 0.0,
      "num_tokens": 1963752.0,
      "reward": 0.503125011920929,
      "reward_std": 0.5737417936325073,
      "rewards/reward_func/mean": 0.503125011920929,
      "rewards/reward_func/std": 0.5312027335166931,
      "step": 418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7274225316941738,
      "epoch": 0.818359375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17489460110664368,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 0.0,
      "num_tokens": 1968624.0,
      "reward": 0.75,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6172376144677401,
      "epoch": 0.8203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2108323872089386,
      "learning_rate": 2.7947368421052635e-06,
      "loss": -0.0,
      "num_tokens": 1973280.0,
      "reward": 0.754687488079071,
      "reward_std": 0.4906249940395355,
      "rewards/reward_func/mean": 0.754687488079071,
      "rewards/reward_func/std": 0.45434102416038513,
      "step": 420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4672897644340992,
      "epoch": 0.822265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.203911691904068,
      "learning_rate": 2.789473684210526e-06,
      "loss": -0.0,
      "num_tokens": 1978264.0,
      "reward": 0.12350000441074371,
      "reward_std": 0.23065190017223358,
      "rewards/reward_func/mean": 0.12350000441074371,
      "rewards/reward_func/std": 0.30972936749458313,
      "step": 421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.40012666769325733,
      "epoch": 0.82421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19789475202560425,
      "learning_rate": 2.7842105263157898e-06,
      "loss": 0.0,
      "num_tokens": 1983232.0,
      "reward": 0.1172500029206276,
      "reward_std": 0.2345000058412552,
      "rewards/reward_func/mean": 0.1172500029206276,
      "rewards/reward_func/std": 0.3119211196899414,
      "step": 422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6508579887449741,
      "epoch": 0.826171875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13198713958263397,
      "learning_rate": 2.7789473684210525e-06,
      "loss": -0.0,
      "num_tokens": 1987864.0,
      "reward": 0.9679999947547913,
      "reward_std": 0.008000005967915058,
      "rewards/reward_func/mean": 0.9679999947547913,
      "rewards/reward_func/std": 0.011313710361719131,
      "step": 423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8873192630708218,
      "epoch": 0.828125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17348502576351166,
      "learning_rate": 2.773684210526316e-06,
      "loss": -0.0,
      "num_tokens": 1992728.0,
      "reward": 0.9947500228881836,
      "reward_std": 0.010500003583729267,
      "rewards/reward_func/mean": 0.9947500228881836,
      "rewards/reward_func/std": 0.014849240891635418,
      "step": 424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4722721241414547,
      "epoch": 0.830078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2051362246274948,
      "learning_rate": 2.7684210526315793e-06,
      "loss": 0.0,
      "num_tokens": 1997376.0,
      "reward": 0.49486249685287476,
      "reward_std": 0.4869329631328583,
      "rewards/reward_func/mean": 0.49486249685287476,
      "rewards/reward_func/std": 0.5131271481513977,
      "step": 425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.29726838506758213,
      "epoch": 0.83203125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11943374574184418,
      "learning_rate": 2.7631578947368424e-06,
      "loss": 0.0,
      "num_tokens": 2001864.0,
      "reward": 0.8113000392913818,
      "reward_std": 0.23103395104408264,
      "rewards/reward_func/mean": 0.8113000392913818,
      "rewards/reward_func/std": 0.32786741852760315,
      "step": 426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5415227115154266,
      "epoch": 0.833984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2265365570783615,
      "learning_rate": 2.7578947368421056e-06,
      "loss": -0.0,
      "num_tokens": 2006496.0,
      "reward": 0.6188374757766724,
      "reward_std": 0.2559331953525543,
      "rewards/reward_func/mean": 0.6188374757766724,
      "rewards/reward_func/std": 0.5024172067642212,
      "step": 427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6843532212078571,
      "epoch": 0.8359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21580831706523895,
      "learning_rate": 2.7526315789473683e-06,
      "loss": -0.0,
      "num_tokens": 2011144.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.01781497895717621,
      "rewards/reward_func/mean": 0.012500000186264515,
      "rewards/reward_func/std": 0.01767767034471035,
      "step": 428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5906918495893478,
      "epoch": 0.837890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2688174843788147,
      "learning_rate": 2.747368421052632e-06,
      "loss": 0.0,
      "num_tokens": 2016000.0,
      "reward": 0.22010000050067902,
      "reward_std": 0.44020000100135803,
      "rewards/reward_func/mean": 0.22010000050067902,
      "rewards/reward_func/std": 0.40758687257766724,
      "step": 429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.541338412091136,
      "epoch": 0.83984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25273582339286804,
      "learning_rate": 2.7421052631578947e-06,
      "loss": 0.0,
      "num_tokens": 2020848.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5117669757455587,
      "epoch": 0.841796875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14728084206581116,
      "learning_rate": 2.7368421052631583e-06,
      "loss": -0.0,
      "num_tokens": 2026016.0,
      "reward": 0.7562500238418579,
      "reward_std": 0.28164324164390564,
      "rewards/reward_func/mean": 0.7562500238418579,
      "rewards/reward_func/std": 0.45153507590293884,
      "step": 431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7267967909574509,
      "epoch": 0.84375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23854367434978485,
      "learning_rate": 2.7315789473684214e-06,
      "loss": 0.0,
      "num_tokens": 2030440.0,
      "reward": 0.7425000071525574,
      "reward_std": 0.4954078197479248,
      "rewards/reward_func/mean": 0.7425000071525574,
      "rewards/reward_func/std": 0.45874831080436707,
      "step": 432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.582302026450634,
      "epoch": 0.845703125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1844756305217743,
      "learning_rate": 2.7263157894736846e-06,
      "loss": -0.0,
      "num_tokens": 2035200.0,
      "reward": 0.8224999904632568,
      "reward_std": 0.23499999940395355,
      "rewards/reward_func/mean": 0.8224999904632568,
      "rewards/reward_func/std": 0.33234018087387085,
      "step": 433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.816022414714098,
      "epoch": 0.84765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23570871353149414,
      "learning_rate": 2.7210526315789478e-06,
      "loss": -0.0,
      "num_tokens": 2039984.0,
      "reward": 0.35731250047683716,
      "reward_std": 0.496307909488678,
      "rewards/reward_func/mean": 0.35731250047683716,
      "rewards/reward_func/std": 0.4760379195213318,
      "step": 434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.48291225358843803,
      "epoch": 0.849609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18518958985805511,
      "learning_rate": 2.7157894736842105e-06,
      "loss": -0.0,
      "num_tokens": 2044624.0,
      "reward": 0.512499988079071,
      "reward_std": 0.5633107423782349,
      "rewards/reward_func/mean": 0.512499988079071,
      "rewards/reward_func/std": 0.5215447545051575,
      "step": 435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6258512511849403,
      "epoch": 0.8515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20368583500385284,
      "learning_rate": 2.710526315789474e-06,
      "loss": -0.0,
      "num_tokens": 2049472.0,
      "reward": 0.23640000820159912,
      "reward_std": 0.47280001640319824,
      "rewards/reward_func/mean": 0.23640000820159912,
      "rewards/reward_func/std": 0.43869248032569885,
      "step": 436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5136358644813299,
      "epoch": 0.853515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2028164565563202,
      "learning_rate": 2.705263157894737e-06,
      "loss": -0.0,
      "num_tokens": 2054240.0,
      "reward": 0.25462499260902405,
      "reward_std": 0.2947234511375427,
      "rewards/reward_func/mean": 0.25462499260902405,
      "rewards/reward_func/std": 0.43380802869796753,
      "step": 437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9727239087224007,
      "epoch": 0.85546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2779141068458557,
      "learning_rate": 2.7000000000000004e-06,
      "loss": -0.0,
      "num_tokens": 2058760.0,
      "reward": 0.512499988079071,
      "reward_std": 0.5629165172576904,
      "rewards/reward_func/mean": 0.512499988079071,
      "rewards/reward_func/std": 0.5215019583702087,
      "step": 438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7157435975968838,
      "epoch": 0.857421875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 2.694736842105263e-06,
      "loss": 0.0,
      "num_tokens": 2063616.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.480139946565032,
      "epoch": 0.859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24946144223213196,
      "learning_rate": 2.6894736842105267e-06,
      "loss": 0.0,
      "num_tokens": 2068552.0,
      "reward": 0.2633500099182129,
      "reward_std": 0.47686171531677246,
      "rewards/reward_func/mean": 0.2633500099182129,
      "rewards/reward_func/std": 0.44164177775382996,
      "step": 440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5838107727468014,
      "epoch": 0.861328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24031829833984375,
      "learning_rate": 2.68421052631579e-06,
      "loss": 0.0,
      "num_tokens": 2073352.0,
      "reward": 0.7457625269889832,
      "reward_std": 0.46000391244888306,
      "rewards/reward_func/mean": 0.7457624673843384,
      "rewards/reward_func/std": 0.4260120391845703,
      "step": 441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4704406224191189,
      "epoch": 0.86328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23864911496639252,
      "learning_rate": 2.6789473684210526e-06,
      "loss": -0.0,
      "num_tokens": 2078136.0,
      "reward": 0.2421249896287918,
      "reward_std": 0.4759899377822876,
      "rewards/reward_func/mean": 0.2421249896287918,
      "rewards/reward_func/std": 0.44069764018058777,
      "step": 442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49487001448869705,
      "epoch": 0.865234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19437025487422943,
      "learning_rate": 2.6736842105263162e-06,
      "loss": 0.0,
      "num_tokens": 2082904.0,
      "reward": 0.2421249896287918,
      "reward_std": 0.27958187460899353,
      "rewards/reward_func/mean": 0.2421249896287918,
      "rewards/reward_func/std": 0.4406470060348511,
      "step": 443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6033957190811634,
      "epoch": 0.8671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21896040439605713,
      "learning_rate": 2.668421052631579e-06,
      "loss": -0.0,
      "num_tokens": 2087560.0,
      "reward": 0.5024124979972839,
      "reward_std": 0.4808311462402344,
      "rewards/reward_func/mean": 0.5024124979972839,
      "rewards/reward_func/std": 0.5208636522293091,
      "step": 444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5963526703417301,
      "epoch": 0.869140625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17856237292289734,
      "learning_rate": 2.6631578947368426e-06,
      "loss": 0.0,
      "num_tokens": 2092416.0,
      "reward": 0.8560999631881714,
      "reward_std": 0.24459999799728394,
      "rewards/reward_func/mean": 0.8560999631881714,
      "rewards/reward_func/std": 0.34591665863990784,
      "step": 445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49352680519223213,
      "epoch": 0.87109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24865403771400452,
      "learning_rate": 2.6578947368421053e-06,
      "loss": 0.0,
      "num_tokens": 2097096.0,
      "reward": 0.7447500228881836,
      "reward_std": 0.4966987371444702,
      "rewards/reward_func/mean": 0.7447500228881836,
      "rewards/reward_func/std": 0.4598980247974396,
      "step": 446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6915704943239689,
      "epoch": 0.873046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2585187554359436,
      "learning_rate": 2.652631578947369e-06,
      "loss": 0.0,
      "num_tokens": 2102104.0,
      "reward": 0.2282000035047531,
      "reward_std": 0.4564000070095062,
      "rewards/reward_func/mean": 0.2282000035047531,
      "rewards/reward_func/std": 0.42270201444625854,
      "step": 447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7159839831292629,
      "epoch": 0.875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.20453768968582153,
      "learning_rate": 2.6473684210526316e-06,
      "loss": 0.0,
      "num_tokens": 2107120.0,
      "reward": 0.11140000075101852,
      "reward_std": 0.22280000150203705,
      "rewards/reward_func/mean": 0.11140000075101852,
      "rewards/reward_func/std": 0.3150867819786072,
      "step": 448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6321196630597115,
      "epoch": 0.876953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2514483332633972,
      "learning_rate": 2.6421052631578948e-06,
      "loss": -0.0,
      "num_tokens": 2111864.0,
      "reward": 0.7442624568939209,
      "reward_std": 0.47556716203689575,
      "rewards/reward_func/mean": 0.7442624568939209,
      "rewards/reward_func/std": 0.4406154155731201,
      "step": 449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5473988149315119,
      "epoch": 0.87890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2295861393213272,
      "learning_rate": 2.6368421052631584e-06,
      "loss": 0.0,
      "num_tokens": 2116800.0,
      "reward": 0.48919999599456787,
      "reward_std": 0.48919999599456787,
      "rewards/reward_func/mean": 0.48919999599456787,
      "rewards/reward_func/std": 0.522976815700531,
      "step": 450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6474330313503742,
      "epoch": 0.880859375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13752923905849457,
      "learning_rate": 2.631578947368421e-06,
      "loss": 0.0,
      "num_tokens": 2121664.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47194433584809303,
      "epoch": 0.8828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22527579963207245,
      "learning_rate": 2.6263157894736847e-06,
      "loss": 0.0,
      "num_tokens": 2126328.0,
      "reward": 0.5039750337600708,
      "reward_std": 0.560730516910553,
      "rewards/reward_func/mean": 0.5039750337600708,
      "rewards/reward_func/std": 0.5192855596542358,
      "step": 452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.696033101528883,
      "epoch": 0.884765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21090690791606903,
      "learning_rate": 2.6210526315789474e-06,
      "loss": -0.0,
      "num_tokens": 2130864.0,
      "reward": 0.7397750020027161,
      "reward_std": 0.28458285331726074,
      "rewards/reward_func/mean": 0.7397749423980713,
      "rewards/reward_func/std": 0.4491344392299652,
      "step": 453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5411256179213524,
      "epoch": 0.88671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18700988590717316,
      "learning_rate": 2.615789473684211e-06,
      "loss": 0.0,
      "num_tokens": 2135832.0,
      "reward": 0.3330000042915344,
      "reward_std": 0.47834351658821106,
      "rewards/reward_func/mean": 0.3330000042915344,
      "rewards/reward_func/std": 0.45958366990089417,
      "step": 454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.602843264117837,
      "epoch": 0.888671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19545380771160126,
      "learning_rate": 2.6105263157894738e-06,
      "loss": -0.0,
      "num_tokens": 2140624.0,
      "reward": 0.24056249856948853,
      "reward_std": 0.47697657346725464,
      "rewards/reward_func/mean": 0.24056249856948853,
      "rewards/reward_func/std": 0.44159868359565735,
      "step": 455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7841911315917969,
      "epoch": 0.890625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16834405064582825,
      "learning_rate": 2.605263157894737e-06,
      "loss": 0.0,
      "num_tokens": 2145096.0,
      "reward": 0.7562500238418579,
      "reward_std": 0.2814582586288452,
      "rewards/reward_func/mean": 0.7562500238418579,
      "rewards/reward_func/std": 0.4513373076915741,
      "step": 456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4610623624175787,
      "epoch": 0.892578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22504417598247528,
      "learning_rate": 2.6e-06,
      "loss": 0.0,
      "num_tokens": 2149848.0,
      "reward": 0.7124999761581421,
      "reward_std": 0.47541630268096924,
      "rewards/reward_func/mean": 0.7125000357627869,
      "rewards/reward_func/std": 0.44025152921676636,
      "step": 457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5260163843631744,
      "epoch": 0.89453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2690121531486511,
      "learning_rate": 2.5947368421052633e-06,
      "loss": 0.0,
      "num_tokens": 2154696.0,
      "reward": 0.350600004196167,
      "reward_std": 0.4991450905799866,
      "rewards/reward_func/mean": 0.350600004196167,
      "rewards/reward_func/std": 0.48443493247032166,
      "step": 458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8946198597550392,
      "epoch": 0.896484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25450319051742554,
      "learning_rate": 2.589473684210527e-06,
      "loss": 0.0,
      "num_tokens": 2159176.0,
      "reward": 0.9945999979972839,
      "reward_std": 0.010800004005432129,
      "rewards/reward_func/mean": 0.9945999979972839,
      "rewards/reward_func/std": 0.009998860768973827,
      "step": 459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3343417225405574,
      "epoch": 0.8984375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.23701933026313782,
      "learning_rate": 2.5842105263157896e-06,
      "loss": -0.0,
      "num_tokens": 2163664.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5373089350759983,
      "epoch": 0.900390625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14844948053359985,
      "learning_rate": 2.578947368421053e-06,
      "loss": 0.0,
      "num_tokens": 2168448.0,
      "reward": 0.23899999260902405,
      "reward_std": 0.27597343921661377,
      "rewards/reward_func/mean": 0.23899999260902405,
      "rewards/reward_func/std": 0.44254201650619507,
      "step": 461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6289132609963417,
      "epoch": 0.90234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2263883650302887,
      "learning_rate": 2.573684210526316e-06,
      "loss": 0.0,
      "num_tokens": 2173248.0,
      "reward": 0.6249375343322754,
      "reward_std": 0.5195988416671753,
      "rewards/reward_func/mean": 0.6249375343322754,
      "rewards/reward_func/std": 0.4944932758808136,
      "step": 462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9023691639304161,
      "epoch": 0.904296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3016529679298401,
      "learning_rate": 2.568421052631579e-06,
      "loss": 0.0,
      "num_tokens": 2177720.0,
      "reward": 0.6285499930381775,
      "reward_std": 0.25130394101142883,
      "rewards/reward_func/mean": 0.6285499930381775,
      "rewards/reward_func/std": 0.5067988634109497,
      "step": 463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5828330963850021,
      "epoch": 0.90625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20633725821971893,
      "learning_rate": 2.5631578947368422e-06,
      "loss": 0.0,
      "num_tokens": 2182560.0,
      "reward": 0.6124000549316406,
      "reward_std": 0.48071789741516113,
      "rewards/reward_func/mean": 0.6124000549316406,
      "rewards/reward_func/std": 0.46755528450012207,
      "step": 464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.421830615028739,
      "epoch": 0.908203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1946926712989807,
      "learning_rate": 2.5578947368421054e-06,
      "loss": -0.0,
      "num_tokens": 2187512.0,
      "reward": 0.24700000882148743,
      "reward_std": 0.2572190761566162,
      "rewards/reward_func/mean": 0.24700000882148743,
      "rewards/reward_func/std": 0.39629149436950684,
      "step": 465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.38339319732040167,
      "epoch": 0.91015625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11662346869707108,
      "learning_rate": 2.552631578947369e-06,
      "loss": 0.0,
      "num_tokens": 2192024.0,
      "reward": 0.8113000392913818,
      "reward_std": 0.23103395104408264,
      "rewards/reward_func/mean": 0.8113000392913818,
      "rewards/reward_func/std": 0.32786741852760315,
      "step": 466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.733765222132206,
      "epoch": 0.912109375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13307461142539978,
      "learning_rate": 2.5473684210526317e-06,
      "loss": -0.0,
      "num_tokens": 2197040.0,
      "reward": 0.11410000175237656,
      "reward_std": 0.2282000035047531,
      "rewards/reward_func/mean": 0.11410000175237656,
      "rewards/reward_func/std": 0.3227235674858093,
      "step": 467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5777375996112823,
      "epoch": 0.9140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2451069951057434,
      "learning_rate": 2.5421052631578953e-06,
      "loss": 0.0,
      "num_tokens": 2201888.0,
      "reward": 0.58160001039505,
      "reward_std": 0.5065810680389404,
      "rewards/reward_func/mean": 0.58160001039505,
      "rewards/reward_func/std": 0.4829333424568176,
      "step": 468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5028033722192049,
      "epoch": 0.916015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23731045424938202,
      "learning_rate": 2.536842105263158e-06,
      "loss": 0.0,
      "num_tokens": 2206744.0,
      "reward": 0.7462999820709229,
      "reward_std": 0.4641999900341034,
      "rewards/reward_func/mean": 0.7462999820709229,
      "rewards/reward_func/std": 0.4297657012939453,
      "step": 469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49811795353889465,
      "epoch": 0.91796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20878010988235474,
      "learning_rate": 2.5315789473684212e-06,
      "loss": 0.0,
      "num_tokens": 2211416.0,
      "reward": 0.7425000071525574,
      "reward_std": 0.30367511510849,
      "rewards/reward_func/mean": 0.7425000071525574,
      "rewards/reward_func/std": 0.4587482810020447,
      "step": 470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.36679044365882874,
      "epoch": 0.919921875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12484736740589142,
      "learning_rate": 2.5263157894736844e-06,
      "loss": -0.0,
      "num_tokens": 2215912.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5753449611365795,
      "epoch": 0.921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23447397351264954,
      "learning_rate": 2.5210526315789475e-06,
      "loss": 0.0,
      "num_tokens": 2220568.0,
      "reward": 0.38792502880096436,
      "reward_std": 0.5182595252990723,
      "rewards/reward_func/mean": 0.38792499899864197,
      "rewards/reward_func/std": 0.5017743706703186,
      "step": 472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4622494773939252,
      "epoch": 0.923828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25002261996269226,
      "learning_rate": 2.5157894736842107e-06,
      "loss": 0.0,
      "num_tokens": 2225352.0,
      "reward": 0.249937504529953,
      "reward_std": 0.47128826379776,
      "rewards/reward_func/mean": 0.249937504529953,
      "rewards/reward_func/std": 0.4365290105342865,
      "step": 473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5469172280281782,
      "epoch": 0.92578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20600056648254395,
      "learning_rate": 2.510526315789474e-06,
      "loss": 0.0,
      "num_tokens": 2230336.0,
      "reward": 0.22462499141693115,
      "reward_std": 0.44099122285842896,
      "rewards/reward_func/mean": 0.22462499141693115,
      "rewards/reward_func/std": 0.4093196392059326,
      "step": 474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6319026313722134,
      "epoch": 0.927734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3944365382194519,
      "learning_rate": 2.5052631578947375e-06,
      "loss": 0.0,
      "num_tokens": 2235096.0,
      "reward": 0.5051125288009644,
      "reward_std": 0.4934867322444916,
      "rewards/reward_func/mean": 0.5051125288009644,
      "rewards/reward_func/std": 0.5237316489219666,
      "step": 475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.34385241754353046,
      "epoch": 0.9296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18634098768234253,
      "learning_rate": 2.5e-06,
      "loss": 0.0,
      "num_tokens": 2240048.0,
      "reward": 0.4440000057220459,
      "reward_std": 0.5151644945144653,
      "rewards/reward_func/mean": 0.4440000057220459,
      "rewards/reward_func/std": 0.47747913002967834,
      "step": 476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5607694983482361,
      "epoch": 0.931640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19195789098739624,
      "learning_rate": 2.4947368421052634e-06,
      "loss": -0.0,
      "num_tokens": 2244704.0,
      "reward": 0.6195999979972839,
      "reward_std": 0.25623539090156555,
      "rewards/reward_func/mean": 0.6195999979972839,
      "rewards/reward_func/std": 0.513155460357666,
      "step": 477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6188374534249306,
      "epoch": 0.93359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19206063449382782,
      "learning_rate": 2.4894736842105265e-06,
      "loss": 0.0,
      "num_tokens": 2249352.0,
      "reward": 0.7438000440597534,
      "reward_std": 0.4960067868232727,
      "rewards/reward_func/mean": 0.7437999844551086,
      "rewards/reward_func/std": 0.45921406149864197,
      "step": 478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7727699540555477,
      "epoch": 0.935546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24975650012493134,
      "learning_rate": 2.4842105263157897e-06,
      "loss": -0.0,
      "num_tokens": 2254096.0,
      "reward": 0.7468999624252319,
      "reward_std": 0.46515488624572754,
      "rewards/reward_func/mean": 0.7468999624252319,
      "rewards/reward_func/std": 0.43068981170654297,
      "step": 479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5285328812897205,
      "epoch": 0.9375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18033409118652344,
      "learning_rate": 2.478947368421053e-06,
      "loss": 0.0,
      "num_tokens": 2259064.0,
      "reward": 0.22200000286102295,
      "reward_std": 0.4440000057220459,
      "rewards/reward_func/mean": 0.22200000286102295,
      "rewards/reward_func/std": 0.41106414794921875,
      "step": 480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6708570122718811,
      "epoch": 0.939453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23435814678668976,
      "learning_rate": 2.473684210526316e-06,
      "loss": 0.0,
      "num_tokens": 2263720.0,
      "reward": 0.3711249828338623,
      "reward_std": 0.5285789370536804,
      "rewards/reward_func/mean": 0.3711249828338623,
      "rewards/reward_func/std": 0.5054450631141663,
      "step": 481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6528993677347898,
      "epoch": 0.94140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23905879259109497,
      "learning_rate": 2.468421052631579e-06,
      "loss": -0.0,
      "num_tokens": 2268472.0,
      "reward": 0.6135500073432922,
      "reward_std": 0.5113159418106079,
      "rewards/reward_func/mean": 0.6135500073432922,
      "rewards/reward_func/std": 0.4950350224971771,
      "step": 482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47676665522158146,
      "epoch": 0.943359375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.145980104804039,
      "learning_rate": 2.4631578947368424e-06,
      "loss": 0.0,
      "num_tokens": 2273312.0,
      "reward": 0.8654749989509583,
      "reward_std": 0.22585000097751617,
      "rewards/reward_func/mean": 0.8654749989509583,
      "rewards/reward_func/std": 0.3194001615047455,
      "step": 483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5406187996268272,
      "epoch": 0.9453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23957812786102295,
      "learning_rate": 2.4578947368421055e-06,
      "loss": 0.0,
      "num_tokens": 2278288.0,
      "reward": 0.2150000035762787,
      "reward_std": 0.4300000071525574,
      "rewards/reward_func/mean": 0.2150000035762787,
      "rewards/reward_func/std": 0.39838388562202454,
      "step": 484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5646249614655972,
      "epoch": 0.947265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2331782728433609,
      "learning_rate": 2.4526315789473687e-06,
      "loss": 0.0,
      "num_tokens": 2283248.0,
      "reward": 0.11626250296831131,
      "reward_std": 0.2122942954301834,
      "rewards/reward_func/mean": 0.11626250296831131,
      "rewards/reward_func/std": 0.28399014472961426,
      "step": 485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5729491971433163,
      "epoch": 0.94921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22905918955802917,
      "learning_rate": 2.447368421052632e-06,
      "loss": 0.0,
      "num_tokens": 2287904.0,
      "reward": 0.49886250495910645,
      "reward_std": 0.4972279667854309,
      "rewards/reward_func/mean": 0.49886250495910645,
      "rewards/reward_func/std": 0.5300286412239075,
      "step": 486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6615194007754326,
      "epoch": 0.951171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26248008012771606,
      "learning_rate": 2.442105263157895e-06,
      "loss": 0.0,
      "num_tokens": 2292752.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5113859847187996,
      "epoch": 0.953125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14094099402427673,
      "learning_rate": 2.436842105263158e-06,
      "loss": -0.0,
      "num_tokens": 2297416.0,
      "reward": 0.754687488079071,
      "reward_std": 0.2833658754825592,
      "rewards/reward_func/mean": 0.754687488079071,
      "rewards/reward_func/std": 0.45434102416038513,
      "step": 488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6214317940175533,
      "epoch": 0.955078125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16864041984081268,
      "learning_rate": 2.4315789473684213e-06,
      "loss": 0.0,
      "num_tokens": 2301776.0,
      "reward": 0.6328125,
      "reward_std": 0.24523451924324036,
      "rewards/reward_func/mean": 0.6328125,
      "rewards/reward_func/std": 0.5071338415145874,
      "step": 489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5048791384324431,
      "epoch": 0.95703125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13176527619361877,
      "learning_rate": 2.4263157894736845e-06,
      "loss": -0.0,
      "num_tokens": 2306272.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8380111455917358,
      "epoch": 0.958984375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1738596111536026,
      "learning_rate": 2.4210526315789477e-06,
      "loss": -0.0,
      "num_tokens": 2310752.0,
      "reward": 0.754687488079071,
      "reward_std": 0.2833658754825592,
      "rewards/reward_func/mean": 0.754687488079071,
      "rewards/reward_func/std": 0.45434102416038513,
      "step": 491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3626231402158737,
      "epoch": 0.9609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1840285211801529,
      "learning_rate": 2.415789473684211e-06,
      "loss": -0.0,
      "num_tokens": 2315256.0,
      "reward": 0.6972000002861023,
      "reward_std": 0.46480000019073486,
      "rewards/reward_func/mean": 0.6972000002861023,
      "rewards/reward_func/std": 0.43032118678092957,
      "step": 492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7078562341630459,
      "epoch": 0.962890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20519906282424927,
      "learning_rate": 2.410526315789474e-06,
      "loss": -0.0,
      "num_tokens": 2320008.0,
      "reward": 0.488349974155426,
      "reward_std": 0.4877285361289978,
      "rewards/reward_func/mean": 0.4883500039577484,
      "rewards/reward_func/std": 0.52220219373703,
      "step": 493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5425986461341381,
      "epoch": 0.96484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2026311159133911,
      "learning_rate": 2.4052631578947367e-06,
      "loss": 0.0,
      "num_tokens": 2324960.0,
      "reward": 0.24543750286102295,
      "reward_std": 0.2619554400444031,
      "rewards/reward_func/mean": 0.24543750286102295,
      "rewards/reward_func/std": 0.3972027599811554,
      "step": 494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.54656021296978,
      "epoch": 0.966796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2986944913864136,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 0.0,
      "num_tokens": 2329616.0,
      "reward": 0.26292499899864197,
      "reward_std": 0.4851202964782715,
      "rewards/reward_func/mean": 0.26292499899864197,
      "rewards/reward_func/std": 0.4491825997829437,
      "step": 495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8450591489672661,
      "epoch": 0.96875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.36952802538871765,
      "learning_rate": 2.3947368421052635e-06,
      "loss": 0.0,
      "num_tokens": 2334088.0,
      "reward": 0.6332374811172485,
      "reward_std": 0.5205224752426147,
      "rewards/reward_func/mean": 0.6332374811172485,
      "rewards/reward_func/std": 0.5004647970199585,
      "step": 496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6656617224216461,
      "epoch": 0.970703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19458124041557312,
      "learning_rate": 2.3894736842105266e-06,
      "loss": 0.0,
      "num_tokens": 2339056.0,
      "reward": 0.3577499985694885,
      "reward_std": 0.46434903144836426,
      "rewards/reward_func/mean": 0.3577499985694885,
      "rewards/reward_func/std": 0.4540364146232605,
      "step": 497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6396152228116989,
      "epoch": 0.97265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25116077065467834,
      "learning_rate": 2.38421052631579e-06,
      "loss": -0.0,
      "num_tokens": 2343704.0,
      "reward": 0.37542498111724854,
      "reward_std": 0.5320296287536621,
      "rewards/reward_func/mean": 0.3754250109195709,
      "rewards/reward_func/std": 0.5113484263420105,
      "step": 498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4765377175062895,
      "epoch": 0.974609375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15191896259784698,
      "learning_rate": 2.378947368421053e-06,
      "loss": 0.0,
      "num_tokens": 2348496.0,
      "reward": 0.23672500252723694,
      "reward_std": 0.26632454991340637,
      "rewards/reward_func/mean": 0.23672500252723694,
      "rewards/reward_func/std": 0.43085548281669617,
      "step": 499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7641369961202145,
      "epoch": 0.9765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2522798776626587,
      "learning_rate": 2.373684210526316e-06,
      "loss": 0.0,
      "num_tokens": 2353504.0,
      "reward": 0.475600004196167,
      "reward_std": 0.5505199432373047,
      "rewards/reward_func/mean": 0.475600004196167,
      "rewards/reward_func/std": 0.5098142623901367,
      "step": 500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5594486184418201,
      "epoch": 0.978515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24103155732154846,
      "learning_rate": 2.368421052631579e-06,
      "loss": -0.0,
      "num_tokens": 2358360.0,
      "reward": 0.8697500228881836,
      "reward_std": 0.2605000138282776,
      "rewards/reward_func/mean": 0.8697500228881836,
      "rewards/reward_func/std": 0.3517392575740814,
      "step": 501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5576187632977962,
      "epoch": 0.98046875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.22993963956832886,
      "learning_rate": 2.363157894736842e-06,
      "loss": 0.0,
      "num_tokens": 2362984.0,
      "reward": 0.878125011920929,
      "reward_std": 0.24375002086162567,
      "rewards/reward_func/mean": 0.878125011920929,
      "rewards/reward_func/std": 0.34471458196640015,
      "step": 502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7988773807883263,
      "epoch": 0.982421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25204092264175415,
      "learning_rate": 2.357894736842105e-06,
      "loss": -0.0,
      "num_tokens": 2367472.0,
      "reward": 0.2640625238418579,
      "reward_std": 0.49094337224960327,
      "rewards/reward_func/mean": 0.2640624940395355,
      "rewards/reward_func/std": 0.45456206798553467,
      "step": 503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5971827898174524,
      "epoch": 0.984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21355661749839783,
      "learning_rate": 2.352631578947369e-06,
      "loss": 0.0,
      "num_tokens": 2372120.0,
      "reward": 0.7473000288009644,
      "reward_std": 0.4982522130012512,
      "rewards/reward_func/mean": 0.7473000288009644,
      "rewards/reward_func/std": 0.4613037705421448,
      "step": 504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5676711704581976,
      "epoch": 0.986328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.37902572751045227,
      "learning_rate": 2.347368421052632e-06,
      "loss": 0.0,
      "num_tokens": 2377088.0,
      "reward": 0.3488750159740448,
      "reward_std": 0.4752463102340698,
      "rewards/reward_func/mean": 0.3488750159740448,
      "rewards/reward_func/std": 0.4613319933414459,
      "step": 505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7709695436060429,
      "epoch": 0.98828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24499008059501648,
      "learning_rate": 2.342105263157895e-06,
      "loss": 0.0,
      "num_tokens": 2381888.0,
      "reward": 0.7379875183105469,
      "reward_std": 0.4632129669189453,
      "rewards/reward_func/mean": 0.7379875183105469,
      "rewards/reward_func/std": 0.42976444959640503,
      "step": 506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4376491168513894,
      "epoch": 0.990234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22884558141231537,
      "learning_rate": 2.3368421052631583e-06,
      "loss": 0.0,
      "num_tokens": 2386376.0,
      "reward": 0.46480000019073486,
      "reward_std": 0.5367048382759094,
      "rewards/reward_func/mean": 0.46480000019073486,
      "rewards/reward_func/std": 0.49689212441444397,
      "step": 507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8945914134383202,
      "epoch": 0.9921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24041084945201874,
      "learning_rate": 2.331578947368421e-06,
      "loss": 0.0,
      "num_tokens": 2390872.0,
      "reward": 0.5062500238418579,
      "reward_std": 0.49611565470695496,
      "rewards/reward_func/mean": 0.5062500238418579,
      "rewards/reward_func/std": 0.5280946493148804,
      "step": 508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8071067947894335,
      "epoch": 0.994140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25361520051956177,
      "learning_rate": 2.326315789473684e-06,
      "loss": 0.0,
      "num_tokens": 2395344.0,
      "reward": 0.507099986076355,
      "reward_std": 0.5571578741073608,
      "rewards/reward_func/mean": 0.507099986076355,
      "rewards/reward_func/std": 0.5159706473350525,
      "step": 509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.576714625582099,
      "epoch": 0.99609375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13069601356983185,
      "learning_rate": 2.3210526315789473e-06,
      "loss": 0.0,
      "num_tokens": 2400304.0,
      "reward": 0.11100000143051147,
      "reward_std": 0.22200000286102295,
      "rewards/reward_func/mean": 0.11100000143051147,
      "rewards/reward_func/std": 0.3139553964138031,
      "step": 510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5591612830758095,
      "epoch": 0.998046875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16368049383163452,
      "learning_rate": 2.3157894736842105e-06,
      "loss": 0.0,
      "num_tokens": 2405088.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/reward_func/mean": 0.0031250000465661287,
      "rewards/reward_func/std": 0.008838835172355175,
      "step": 511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7550145424902439,
      "epoch": 1.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2605975568294525,
      "learning_rate": 2.310526315789474e-06,
      "loss": 0.0,
      "num_tokens": 2409848.0,
      "reward": 0.36937499046325684,
      "reward_std": 0.5097146034240723,
      "rewards/reward_func/mean": 0.36937499046325684,
      "rewards/reward_func/std": 0.49009066820144653,
      "step": 512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6688739284873009,
      "epoch": 1.001953125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16578584909439087,
      "learning_rate": 2.3052631578947373e-06,
      "loss": 0.0,
      "num_tokens": 2414288.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7435273714363575,
      "epoch": 1.00390625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 2.3000000000000004e-06,
      "loss": 0.0,
      "num_tokens": 2418728.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6509594097733498,
      "epoch": 1.005859375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16100262105464935,
      "learning_rate": 2.294736842105263e-06,
      "loss": -0.0,
      "num_tokens": 2423488.0,
      "reward": 0.8224999904632568,
      "reward_std": 0.23499999940395355,
      "rewards/reward_func/mean": 0.8224999904632568,
      "rewards/reward_func/std": 0.33234018087387085,
      "step": 515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6421031467616558,
      "epoch": 1.0078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20752696692943573,
      "learning_rate": 2.2894736842105263e-06,
      "loss": 0.0,
      "num_tokens": 2428328.0,
      "reward": 0.3933500051498413,
      "reward_std": 0.24158133566379547,
      "rewards/reward_func/mean": 0.3933500051498413,
      "rewards/reward_func/std": 0.4744868576526642,
      "step": 516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.2608198570087552,
      "epoch": 1.009765625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09608624875545502,
      "learning_rate": 2.2842105263157895e-06,
      "loss": 0.0,
      "num_tokens": 2432816.0,
      "reward": 0.6972000002861023,
      "reward_std": 0.2683524191379547,
      "rewards/reward_func/mean": 0.6972000002861023,
      "rewards/reward_func/std": 0.43032118678092957,
      "step": 517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7466496359556913,
      "epoch": 1.01171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2451416254043579,
      "learning_rate": 2.2789473684210527e-06,
      "loss": -0.0,
      "num_tokens": 2437616.0,
      "reward": 0.8708249926567078,
      "reward_std": 0.2384064942598343,
      "rewards/reward_func/mean": 0.8708249926567078,
      "rewards/reward_func/std": 0.3219219446182251,
      "step": 518
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6013193130493164,
      "epoch": 1.013671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20224137604236603,
      "learning_rate": 2.273684210526316e-06,
      "loss": -0.0,
      "num_tokens": 2442272.0,
      "reward": 0.36959999799728394,
      "reward_std": 0.5324397087097168,
      "rewards/reward_func/mean": 0.36959999799728394,
      "rewards/reward_func/std": 0.5101400017738342,
      "step": 519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7230842337012291,
      "epoch": 1.015625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16407498717308044,
      "learning_rate": 2.268421052631579e-06,
      "loss": 0.0,
      "num_tokens": 2447272.0,
      "reward": 0.11680000275373459,
      "reward_std": 0.23360000550746918,
      "rewards/reward_func/mean": 0.11680000275373459,
      "rewards/reward_func/std": 0.3303602933883667,
      "step": 520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.39533343352377415,
      "epoch": 1.017578125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 2.2631578947368426e-06,
      "loss": 0.0,
      "num_tokens": 2451776.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6934016719460487,
      "epoch": 1.01953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20798802375793457,
      "learning_rate": 2.2578947368421057e-06,
      "loss": 0.0,
      "num_tokens": 2456392.0,
      "reward": 0.7335000038146973,
      "reward_std": 0.48899999260902405,
      "rewards/reward_func/mean": 0.7335000038146973,
      "rewards/reward_func/std": 0.4527260363101959,
      "step": 522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49564963206648827,
      "epoch": 1.021484375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1638687700033188,
      "learning_rate": 2.2526315789473685e-06,
      "loss": 0.0,
      "num_tokens": 2461176.0,
      "reward": 0.23899999260902405,
      "reward_std": 0.27597343921661377,
      "rewards/reward_func/mean": 0.23899999260902405,
      "rewards/reward_func/std": 0.44254201650619507,
      "step": 523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4542197324335575,
      "epoch": 1.0234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18925493955612183,
      "learning_rate": 2.2473684210526316e-06,
      "loss": 0.0,
      "num_tokens": 2465840.0,
      "reward": 0.7464874982833862,
      "reward_std": 0.2938106656074524,
      "rewards/reward_func/mean": 0.7464874982833862,
      "rewards/reward_func/std": 0.449546217918396,
      "step": 524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7139078304171562,
      "epoch": 1.025390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23500654101371765,
      "learning_rate": 2.242105263157895e-06,
      "loss": 0.0,
      "num_tokens": 2470688.0,
      "reward": 0.3981500267982483,
      "reward_std": 0.49756962060928345,
      "rewards/reward_func/mean": 0.3981499969959259,
      "rewards/reward_func/std": 0.4812353849411011,
      "step": 525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8366854265332222,
      "epoch": 1.02734375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18551307916641235,
      "learning_rate": 2.236842105263158e-06,
      "loss": 0.0,
      "num_tokens": 2475448.0,
      "reward": 0.7081249952316284,
      "reward_std": 0.26779481768608093,
      "rewards/reward_func/mean": 0.7081249952316284,
      "rewards/reward_func/std": 0.42940106987953186,
      "step": 526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6392475813627243,
      "epoch": 1.029296875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 2.231578947368421e-06,
      "loss": 0.0,
      "num_tokens": 2479800.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7336869612336159,
      "epoch": 1.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2812504172325134,
      "learning_rate": 2.2263157894736843e-06,
      "loss": 0.0,
      "num_tokens": 2484304.0,
      "reward": 0.6352249979972839,
      "reward_std": 0.5144467949867249,
      "rewards/reward_func/mean": 0.6352249979972839,
      "rewards/reward_func/std": 0.4920179843902588,
      "step": 528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6500775553286076,
      "epoch": 1.033203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2616865932941437,
      "learning_rate": 2.221052631578948e-06,
      "loss": -0.0,
      "num_tokens": 2489104.0,
      "reward": 0.2381249964237213,
      "reward_std": 0.27268749475479126,
      "rewards/reward_func/mean": 0.2381249964237213,
      "rewards/reward_func/std": 0.43324100971221924,
      "step": 529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6904492862522602,
      "epoch": 1.03515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20736540853977203,
      "learning_rate": 2.2157894736842106e-06,
      "loss": 0.0,
      "num_tokens": 2494112.0,
      "reward": 0.350600004196167,
      "reward_std": 0.5038028955459595,
      "rewards/reward_func/mean": 0.350600004196167,
      "rewards/reward_func/std": 0.48401686549186707,
      "step": 530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8729193657636642,
      "epoch": 1.037109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2569698393344879,
      "learning_rate": 2.2105263157894738e-06,
      "loss": 0.0,
      "num_tokens": 2498864.0,
      "reward": 0.4913625121116638,
      "reward_std": 0.5638903379440308,
      "rewards/reward_func/mean": 0.4913625121116638,
      "rewards/reward_func/std": 0.5222924947738647,
      "step": 531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9219260476529598,
      "epoch": 1.0390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22864028811454773,
      "learning_rate": 2.205263157894737e-06,
      "loss": -0.0,
      "num_tokens": 2503656.0,
      "reward": 0.7295999526977539,
      "reward_std": 0.300462931394577,
      "rewards/reward_func/mean": 0.7296000123023987,
      "rewards/reward_func/std": 0.4509044587612152,
      "step": 532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 1.0916531383991241,
      "epoch": 1.041015625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.2035861611366272,
      "learning_rate": 2.2e-06,
      "loss": 0.0,
      "num_tokens": 2508128.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47885562665760517,
      "epoch": 1.04296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19585774838924408,
      "learning_rate": 2.1947368421052633e-06,
      "loss": 0.0,
      "num_tokens": 2513104.0,
      "reward": 0.5544999837875366,
      "reward_std": 0.4817962646484375,
      "rewards/reward_func/mean": 0.5544999837875366,
      "rewards/reward_func/std": 0.4600767195224762,
      "step": 534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.2929891608655453,
      "epoch": 1.044921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18502745032310486,
      "learning_rate": 2.1894736842105264e-06,
      "loss": -0.0,
      "num_tokens": 2517608.0,
      "reward": 0.7003250122070312,
      "reward_std": 0.45855000615119934,
      "rewards/reward_func/mean": 0.7003250122070312,
      "rewards/reward_func/std": 0.4245873987674713,
      "step": 535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5622880086302757,
      "epoch": 1.046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19996067881584167,
      "learning_rate": 2.1842105263157896e-06,
      "loss": 0.0,
      "num_tokens": 2522272.0,
      "reward": 0.49806249141693115,
      "reward_std": 0.5715733170509338,
      "rewards/reward_func/mean": 0.49806249141693115,
      "rewards/reward_func/std": 0.5292056798934937,
      "step": 536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.43428971618413925,
      "epoch": 1.048828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1537165641784668,
      "learning_rate": 2.1789473684210528e-06,
      "loss": -0.0,
      "num_tokens": 2526936.0,
      "reward": 0.12306249886751175,
      "reward_std": 0.2461249977350235,
      "rewards/reward_func/mean": 0.12306249886751175,
      "rewards/reward_func/std": 0.34305045008659363,
      "step": 537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7195999212563038,
      "epoch": 1.05078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2509689927101135,
      "learning_rate": 2.173684210526316e-06,
      "loss": 0.0,
      "num_tokens": 2531464.0,
      "reward": 0.6269875168800354,
      "reward_std": 0.5275543928146362,
      "rewards/reward_func/mean": 0.6269874572753906,
      "rewards/reward_func/std": 0.5090279579162598,
      "step": 538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7064641937613487,
      "epoch": 1.052734375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 2.168421052631579e-06,
      "loss": 0.0,
      "num_tokens": 2536328.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6143765226006508,
      "epoch": 1.0546875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 2.1631578947368423e-06,
      "loss": 0.0,
      "num_tokens": 2541080.0,
      "reward": 0.9399999976158142,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9399999976158142,
      "rewards/reward_func/std": 0.0,
      "step": 540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4053959520533681,
      "epoch": 1.056640625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11423177272081375,
      "learning_rate": 2.1578947368421054e-06,
      "loss": -0.0,
      "num_tokens": 2545576.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8161136545240879,
      "epoch": 1.05859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2826439142227173,
      "learning_rate": 2.1526315789473686e-06,
      "loss": 0.0,
      "num_tokens": 2550104.0,
      "reward": 0.7566750049591064,
      "reward_std": 0.479504257440567,
      "rewards/reward_func/mean": 0.7566750049591064,
      "rewards/reward_func/std": 0.4439469873905182,
      "step": 542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7691904827952385,
      "epoch": 1.060546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24437452852725983,
      "learning_rate": 2.1473684210526317e-06,
      "loss": -0.0,
      "num_tokens": 2554888.0,
      "reward": 0.8523874878883362,
      "reward_std": 0.23936279118061066,
      "rewards/reward_func/mean": 0.8523874878883362,
      "rewards/reward_func/std": 0.32976391911506653,
      "step": 543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.33484411239624023,
      "epoch": 1.0625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12194463610649109,
      "learning_rate": 2.142105263157895e-06,
      "loss": -0.0,
      "num_tokens": 2559392.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5565262641757727,
      "epoch": 1.064453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20515061914920807,
      "learning_rate": 2.136842105263158e-06,
      "loss": -0.0,
      "num_tokens": 2564152.0,
      "reward": 0.3663124740123749,
      "reward_std": 0.5069268345832825,
      "rewards/reward_func/mean": 0.36631250381469727,
      "rewards/reward_func/std": 0.4885818362236023,
      "step": 545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3899686820805073,
      "epoch": 1.06640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19220681488513947,
      "learning_rate": 2.1315789473684212e-06,
      "loss": -0.0,
      "num_tokens": 2569088.0,
      "reward": 0.117249995470047,
      "reward_std": 0.2220594733953476,
      "rewards/reward_func/mean": 0.117249995470047,
      "rewards/reward_func/std": 0.3115631341934204,
      "step": 546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5984954424202442,
      "epoch": 1.068359375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1464897096157074,
      "learning_rate": 2.1263157894736844e-06,
      "loss": 0.0,
      "num_tokens": 2574056.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.010825317353010178,
      "rewards/reward_func/mean": 0.00937500037252903,
      "rewards/reward_func/std": 0.01735912822186947,
      "step": 547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6083435006439686,
      "epoch": 1.0703125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 2.1210526315789476e-06,
      "loss": 0.0,
      "num_tokens": 2578800.0,
      "reward": 0.9399999976158142,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9399999976158142,
      "rewards/reward_func/std": 0.0,
      "step": 548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5468325912952423,
      "epoch": 1.072265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21403901278972626,
      "learning_rate": 2.1157894736842107e-06,
      "loss": 0.0,
      "num_tokens": 2583544.0,
      "reward": 0.5962874889373779,
      "reward_std": 0.5044821500778198,
      "rewards/reward_func/mean": 0.5962875485420227,
      "rewards/reward_func/std": 0.4841693639755249,
      "step": 549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6861546561121941,
      "epoch": 1.07421875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18041720986366272,
      "learning_rate": 2.110526315789474e-06,
      "loss": 0.0,
      "num_tokens": 2587976.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7639718391001225,
      "epoch": 1.076171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23512768745422363,
      "learning_rate": 2.105263157894737e-06,
      "loss": 0.0,
      "num_tokens": 2592728.0,
      "reward": 0.7156250476837158,
      "reward_std": 0.2827948331832886,
      "rewards/reward_func/mean": 0.715624988079071,
      "rewards/reward_func/std": 0.4345230460166931,
      "step": 551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4462539367377758,
      "epoch": 1.078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1933043748140335,
      "learning_rate": 2.1000000000000002e-06,
      "loss": -0.0,
      "num_tokens": 2597528.0,
      "reward": 0.2436874806880951,
      "reward_std": 0.47496649622917175,
      "rewards/reward_func/mean": 0.2436874955892563,
      "rewards/reward_func/std": 0.4397376477718353,
      "step": 552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6553896479308605,
      "epoch": 1.080078125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16326484084129333,
      "learning_rate": 2.0947368421052634e-06,
      "loss": 0.0,
      "num_tokens": 2602032.0,
      "reward": 0.8695999979972839,
      "reward_std": 0.24645258486270905,
      "rewards/reward_func/mean": 0.8695999979972839,
      "rewards/reward_func/std": 0.35150691866874695,
      "step": 553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5182516658678651,
      "epoch": 1.08203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18343044817447662,
      "learning_rate": 2.0894736842105266e-06,
      "loss": 0.0,
      "num_tokens": 2606832.0,
      "reward": 0.11836250126361847,
      "reward_std": 0.23672500252723694,
      "rewards/reward_func/mean": 0.11836250126361847,
      "rewards/reward_func/std": 0.3297579884529114,
      "step": 554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7340024933218956,
      "epoch": 1.083984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24913115799427032,
      "learning_rate": 2.0842105263157897e-06,
      "loss": -0.0,
      "num_tokens": 2611360.0,
      "reward": 0.9920499920845032,
      "reward_std": 0.015900004655122757,
      "rewards/reward_func/mean": 0.9920499920845032,
      "rewards/reward_func/std": 0.015697769820690155,
      "step": 555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6129337772727013,
      "epoch": 1.0859375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 2.078947368421053e-06,
      "loss": 0.0,
      "num_tokens": 2615704.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6931223534047604,
      "epoch": 1.087890625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 2.073684210526316e-06,
      "loss": 0.0,
      "num_tokens": 2620352.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7478494979441166,
      "epoch": 1.08984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24718572199344635,
      "learning_rate": 2.068421052631579e-06,
      "loss": 0.0,
      "num_tokens": 2625104.0,
      "reward": 0.9848999977111816,
      "reward_std": 0.013399453833699226,
      "rewards/reward_func/mean": 0.9848999977111816,
      "rewards/reward_func/std": 0.012781685218214989,
      "step": 558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9691029265522957,
      "epoch": 1.091796875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18628694117069244,
      "learning_rate": 2.0631578947368424e-06,
      "loss": -0.0,
      "num_tokens": 2629592.0,
      "reward": 0.754687488079071,
      "reward_std": 0.2833658754825592,
      "rewards/reward_func/mean": 0.754687488079071,
      "rewards/reward_func/std": 0.45434102416038513,
      "step": 559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5964610986411572,
      "epoch": 1.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20570731163024902,
      "learning_rate": 2.0578947368421055e-06,
      "loss": -0.0,
      "num_tokens": 2634240.0,
      "reward": 0.24886250495910645,
      "reward_std": 0.28871649503707886,
      "rewards/reward_func/mean": 0.24886250495910645,
      "rewards/reward_func/std": 0.45700305700302124,
      "step": 560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6072207950055599,
      "epoch": 1.095703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2261851280927658,
      "learning_rate": 2.0526315789473687e-06,
      "loss": 0.0,
      "num_tokens": 2638904.0,
      "reward": 0.6188000440597534,
      "reward_std": 0.5333460569381714,
      "rewards/reward_func/mean": 0.6187999844551086,
      "rewards/reward_func/std": 0.5125207901000977,
      "step": 561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6318282932043076,
      "epoch": 1.09765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21229666471481323,
      "learning_rate": 2.047368421052632e-06,
      "loss": -0.0,
      "num_tokens": 2643640.0,
      "reward": 0.5874999761581421,
      "reward_std": 0.5063546299934387,
      "rewards/reward_func/mean": 0.5874999761581421,
      "rewards/reward_func/std": 0.486496239900589,
      "step": 562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6489912085235119,
      "epoch": 1.099609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25839194655418396,
      "learning_rate": 2.042105263157895e-06,
      "loss": 0.0,
      "num_tokens": 2648504.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8778778314590454,
      "epoch": 1.1015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25570207834243774,
      "learning_rate": 2.036842105263158e-06,
      "loss": 0.0,
      "num_tokens": 2653248.0,
      "reward": 0.5106500387191772,
      "reward_std": 0.5463833808898926,
      "rewards/reward_func/mean": 0.5106500387191772,
      "rewards/reward_func/std": 0.5066616535186768,
      "step": 564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6692373007535934,
      "epoch": 1.103515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20203210413455963,
      "learning_rate": 2.031578947368421e-06,
      "loss": -0.0,
      "num_tokens": 2658240.0,
      "reward": 0.23606249690055847,
      "reward_std": 0.4352036714553833,
      "rewards/reward_func/mean": 0.23606249690055847,
      "rewards/reward_func/std": 0.402925044298172,
      "step": 565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.48891974054276943,
      "epoch": 1.10546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2103108912706375,
      "learning_rate": 2.026315789473684e-06,
      "loss": 0.0,
      "num_tokens": 2663024.0,
      "reward": 0.24681249260902405,
      "reward_std": 0.2817399799823761,
      "rewards/reward_func/mean": 0.24681249260902405,
      "rewards/reward_func/std": 0.43789422512054443,
      "step": 566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.22187136486172676,
      "epoch": 1.107421875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11180789768695831,
      "learning_rate": 2.0210526315789477e-06,
      "loss": 0.0,
      "num_tokens": 2667528.0,
      "reward": 0.6972000002861023,
      "reward_std": 0.2683524191379547,
      "rewards/reward_func/mean": 0.6972000002861023,
      "rewards/reward_func/std": 0.43032118678092957,
      "step": 567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6616909764707088,
      "epoch": 1.109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22444318234920502,
      "learning_rate": 2.015789473684211e-06,
      "loss": 0.0,
      "num_tokens": 2672320.0,
      "reward": 0.736412525177002,
      "reward_std": 0.4869075417518616,
      "rewards/reward_func/mean": 0.736412525177002,
      "rewards/reward_func/std": 0.4509141147136688,
      "step": 568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5226791556924582,
      "epoch": 1.111328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25233376026153564,
      "learning_rate": 2.010526315789474e-06,
      "loss": -0.0,
      "num_tokens": 2677112.0,
      "reward": 0.3647499680519104,
      "reward_std": 0.5094026923179626,
      "rewards/reward_func/mean": 0.3647499680519104,
      "rewards/reward_func/std": 0.4897109270095825,
      "step": 569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7138075400143862,
      "epoch": 1.11328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22373870015144348,
      "learning_rate": 2.005263157894737e-06,
      "loss": 0.0,
      "num_tokens": 2681760.0,
      "reward": 0.49729999899864197,
      "reward_std": 0.5742666125297546,
      "rewards/reward_func/mean": 0.49729999899864197,
      "rewards/reward_func/std": 0.531683087348938,
      "step": 570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49137804098427296,
      "epoch": 1.115234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1849352866411209,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.0,
      "num_tokens": 2686720.0,
      "reward": 0.015625,
      "reward_std": 0.025966878980398178,
      "rewards/reward_func/mean": 0.015625,
      "rewards/reward_func/std": 0.0265165064483881,
      "step": 571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 1.0584421530365944,
      "epoch": 1.1171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32097047567367554,
      "learning_rate": 1.994736842105263e-06,
      "loss": 0.0,
      "num_tokens": 2691216.0,
      "reward": 0.6211625337600708,
      "reward_std": 0.5293147563934326,
      "rewards/reward_func/mean": 0.6211625337600708,
      "rewards/reward_func/std": 0.511013925075531,
      "step": 572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5815026629716158,
      "epoch": 1.119140625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1526409089565277,
      "learning_rate": 1.9894736842105262e-06,
      "loss": 0.0,
      "num_tokens": 2695880.0,
      "reward": 0.8765624761581421,
      "reward_std": 0.24687500298023224,
      "rewards/reward_func/mean": 0.8765624761581421,
      "rewards/reward_func/std": 0.34913399815559387,
      "step": 573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.731554638594389,
      "epoch": 1.12109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23003970086574554,
      "learning_rate": 1.9842105263157894e-06,
      "loss": 0.0,
      "num_tokens": 2700208.0,
      "reward": 0.754687488079071,
      "reward_std": 0.4906249940395355,
      "rewards/reward_func/mean": 0.754687488079071,
      "rewards/reward_func/std": 0.45434102416038513,
      "step": 574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.44338976964354515,
      "epoch": 1.123046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1868741661310196,
      "learning_rate": 1.978947368421053e-06,
      "loss": -0.0,
      "num_tokens": 2704992.0,
      "reward": 0.3651750087738037,
      "reward_std": 0.5036294460296631,
      "rewards/reward_func/mean": 0.3651749789714813,
      "rewards/reward_func/std": 0.4835827052593231,
      "step": 575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5494178477674723,
      "epoch": 1.125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15477776527404785,
      "learning_rate": 1.973684210526316e-06,
      "loss": 0.0,
      "num_tokens": 2709976.0,
      "reward": 0.3361250162124634,
      "reward_std": 0.21575000882148743,
      "rewards/reward_func/mean": 0.3361250162124634,
      "rewards/reward_func/std": 0.4570740759372711,
      "step": 576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.35547622572630644,
      "epoch": 1.126953125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10672120004892349,
      "learning_rate": 1.9684210526315793e-06,
      "loss": 0.0,
      "num_tokens": 2714472.0,
      "reward": 0.9275000095367432,
      "reward_std": 0.0041999914683401585,
      "rewards/reward_func/mean": 0.9275000095367432,
      "rewards/reward_func/std": 0.005939692724496126,
      "step": 577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6676280610263348,
      "epoch": 1.12890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21440531313419342,
      "learning_rate": 1.9631578947368425e-06,
      "loss": 0.0,
      "num_tokens": 2719128.0,
      "reward": 0.3832375109195709,
      "reward_std": 0.5218474268913269,
      "rewards/reward_func/mean": 0.3832375109195709,
      "rewards/reward_func/std": 0.5054107308387756,
      "step": 578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5016189366579056,
      "epoch": 1.130859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20237448811531067,
      "learning_rate": 1.9578947368421052e-06,
      "loss": 0.0,
      "num_tokens": 2723792.0,
      "reward": 0.6060000061988831,
      "reward_std": 0.2554674744606018,
      "rewards/reward_func/mean": 0.6060000061988831,
      "rewards/reward_func/std": 0.502031147480011,
      "step": 579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5959945041686296,
      "epoch": 1.1328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.33901241421699524,
      "learning_rate": 1.9526315789473684e-06,
      "loss": 0.0,
      "num_tokens": 2728560.0,
      "reward": 0.3585500121116638,
      "reward_std": 0.5105670690536499,
      "rewards/reward_func/mean": 0.3585500121116638,
      "rewards/reward_func/std": 0.4953995645046234,
      "step": 580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.948068805038929,
      "epoch": 1.134765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3101772665977478,
      "learning_rate": 1.9473684210526315e-06,
      "loss": -0.0,
      "num_tokens": 2733064.0,
      "reward": 0.6379250288009644,
      "reward_std": 0.512575626373291,
      "rewards/reward_func/mean": 0.6379250288009644,
      "rewards/reward_func/std": 0.494179368019104,
      "step": 581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.46106173656880856,
      "epoch": 1.13671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18642479181289673,
      "learning_rate": 1.9421052631578947e-06,
      "loss": -0.0,
      "num_tokens": 2737984.0,
      "reward": 0.8615000247955322,
      "reward_std": 0.2508353888988495,
      "rewards/reward_func/mean": 0.8615000247955322,
      "rewards/reward_func/std": 0.3482353091239929,
      "step": 582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.27936943667009473,
      "epoch": 1.138671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1574077308177948,
      "learning_rate": 1.936842105263158e-06,
      "loss": 0.0,
      "num_tokens": 2742496.0,
      "reward": 0.6930000185966492,
      "reward_std": 0.4620679020881653,
      "rewards/reward_func/mean": 0.6929999589920044,
      "rewards/reward_func/std": 0.4277917444705963,
      "step": 583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9888619035482407,
      "epoch": 1.140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.266496479511261,
      "learning_rate": 1.9315789473684215e-06,
      "loss": 0.0,
      "num_tokens": 2747016.0,
      "reward": 0.864799976348877,
      "reward_std": 0.2641477882862091,
      "rewards/reward_func/mean": 0.864799976348877,
      "rewards/reward_func/std": 0.35006821155548096,
      "step": 584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7337736170738935,
      "epoch": 1.142578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23653343319892883,
      "learning_rate": 1.9263157894736846e-06,
      "loss": -0.0,
      "num_tokens": 2752000.0,
      "reward": 0.23918750882148743,
      "reward_std": 0.43328288197517395,
      "rewards/reward_func/mean": 0.23918750882148743,
      "rewards/reward_func/std": 0.4011473059654236,
      "step": 585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5843038037419319,
      "epoch": 1.14453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21472285687923431,
      "learning_rate": 1.9210526315789474e-06,
      "loss": 0.0,
      "num_tokens": 2756976.0,
      "reward": 0.3330000042915344,
      "reward_std": 0.47834351658821106,
      "rewards/reward_func/mean": 0.3330000042915344,
      "rewards/reward_func/std": 0.45958366990089417,
      "step": 586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.766981165856123,
      "epoch": 1.146484375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.171160027384758,
      "learning_rate": 1.9157894736842105e-06,
      "loss": -0.0,
      "num_tokens": 2761464.0,
      "reward": 0.7562500238418579,
      "reward_std": 0.28164324164390564,
      "rewards/reward_func/mean": 0.7562500238418579,
      "rewards/reward_func/std": 0.45153507590293884,
      "step": 587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5991237238049507,
      "epoch": 1.1484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23209227621555328,
      "learning_rate": 1.9105263157894737e-06,
      "loss": -0.0,
      "num_tokens": 2766240.0,
      "reward": 0.2514999806880951,
      "reward_std": 0.4700762629508972,
      "rewards/reward_func/mean": 0.2515000104904175,
      "rewards/reward_func/std": 0.4352715015411377,
      "step": 588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4802367826923728,
      "epoch": 1.150390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20853254199028015,
      "learning_rate": 1.905263157894737e-06,
      "loss": -0.0,
      "num_tokens": 2771008.0,
      "reward": 0.26012498140335083,
      "reward_std": 0.47928479313850403,
      "rewards/reward_func/mean": 0.26012498140335083,
      "rewards/reward_func/std": 0.44427764415740967,
      "step": 589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4567495686933398,
      "epoch": 1.15234375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1308472901582718,
      "learning_rate": 1.9000000000000002e-06,
      "loss": -0.0,
      "num_tokens": 2775512.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7382534444332123,
      "epoch": 1.154296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24290543794631958,
      "learning_rate": 1.8947368421052634e-06,
      "loss": 0.0,
      "num_tokens": 2780360.0,
      "reward": 0.23649999499320984,
      "reward_std": 0.4729999899864197,
      "rewards/reward_func/mean": 0.23649999499320984,
      "rewards/reward_func/std": 0.4379509389400482,
      "step": 591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8535390049219131,
      "epoch": 1.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2762669026851654,
      "learning_rate": 1.8894736842105266e-06,
      "loss": 0.0,
      "num_tokens": 2784888.0,
      "reward": 0.7473000288009644,
      "reward_std": 0.4982522130012512,
      "rewards/reward_func/mean": 0.7473000288009644,
      "rewards/reward_func/std": 0.4613037705421448,
      "step": 592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7388596646487713,
      "epoch": 1.158203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2849021852016449,
      "learning_rate": 1.8842105263157895e-06,
      "loss": 0.0,
      "num_tokens": 2789632.0,
      "reward": 0.7318500280380249,
      "reward_std": 0.48854929208755493,
      "rewards/reward_func/mean": 0.7318500280380249,
      "rewards/reward_func/std": 0.4523949921131134,
      "step": 593
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5478004105389118,
      "epoch": 1.16015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21340344846248627,
      "learning_rate": 1.8789473684210527e-06,
      "loss": -0.0,
      "num_tokens": 2794568.0,
      "reward": 0.17698749899864197,
      "reward_std": 0.23321816325187683,
      "rewards/reward_func/mean": 0.17698749899864197,
      "rewards/reward_func/std": 0.3240951597690582,
      "step": 594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7563757076859474,
      "epoch": 1.162109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22627751529216766,
      "learning_rate": 1.8736842105263158e-06,
      "loss": 0.0,
      "num_tokens": 2799584.0,
      "reward": 0.24108749628067017,
      "reward_std": 0.46984440088272095,
      "rewards/reward_func/mean": 0.24108749628067017,
      "rewards/reward_func/std": 0.43506819009780884,
      "step": 595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8104111216962337,
      "epoch": 1.1640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.258118599653244,
      "learning_rate": 1.868421052631579e-06,
      "loss": 0.0,
      "num_tokens": 2804328.0,
      "reward": 0.8609625101089478,
      "reward_std": 0.2564750015735626,
      "rewards/reward_func/mean": 0.8609625101089478,
      "rewards/reward_func/std": 0.3434082269668579,
      "step": 596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.36982312239706516,
      "epoch": 1.166015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19538865983486176,
      "learning_rate": 1.8631578947368424e-06,
      "loss": -0.0,
      "num_tokens": 2808832.0,
      "reward": 0.6972000002861023,
      "reward_std": 0.46480000019073486,
      "rewards/reward_func/mean": 0.6972000002861023,
      "rewards/reward_func/std": 0.43032118678092957,
      "step": 597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6775441057980061,
      "epoch": 1.16796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2285778522491455,
      "learning_rate": 1.8578947368421055e-06,
      "loss": 0.0,
      "num_tokens": 2813488.0,
      "reward": 0.4920499920845032,
      "reward_std": 0.5683344602584839,
      "rewards/reward_func/mean": 0.4920499920845032,
      "rewards/reward_func/std": 0.5261891484260559,
      "step": 598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5741296671330929,
      "epoch": 1.169921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20829658210277557,
      "learning_rate": 1.8526315789473687e-06,
      "loss": -0.0,
      "num_tokens": 2818272.0,
      "reward": 0.12574999034404755,
      "reward_std": 0.24323992431163788,
      "rewards/reward_func/mean": 0.12574999034404755,
      "rewards/reward_func/std": 0.33566170930862427,
      "step": 599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.2949846936389804,
      "epoch": 1.171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1604420691728592,
      "learning_rate": 1.8473684210526319e-06,
      "loss": -0.0,
      "num_tokens": 2822784.0,
      "reward": 0.6972000002861023,
      "reward_std": 0.46480000019073486,
      "rewards/reward_func/mean": 0.6972000002861023,
      "rewards/reward_func/std": 0.43032118678092957,
      "step": 600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.843192458152771,
      "epoch": 1.173828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2168247401714325,
      "learning_rate": 1.8421052631578948e-06,
      "loss": 0.0,
      "num_tokens": 2827528.0,
      "reward": 0.7124999761581421,
      "reward_std": 0.47541630268096924,
      "rewards/reward_func/mean": 0.7124999761581421,
      "rewards/reward_func/std": 0.44025155901908875,
      "step": 601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7822651527822018,
      "epoch": 1.17578125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18856878578662872,
      "learning_rate": 1.836842105263158e-06,
      "loss": 0.0,
      "num_tokens": 2831880.0,
      "reward": 0.75,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6440773196518421,
      "epoch": 1.177734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23340384662151337,
      "learning_rate": 1.8315789473684211e-06,
      "loss": -0.0,
      "num_tokens": 2836528.0,
      "reward": 0.7204999923706055,
      "reward_std": 0.481002539396286,
      "rewards/reward_func/mean": 0.7204999923706055,
      "rewards/reward_func/std": 0.4453295171260834,
      "step": 603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.576304204761982,
      "epoch": 1.1796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23746424913406372,
      "learning_rate": 1.8263157894736843e-06,
      "loss": -0.0,
      "num_tokens": 2841192.0,
      "reward": 0.24729999899864197,
      "reward_std": 0.49459999799728394,
      "rewards/reward_func/mean": 0.24729999899864197,
      "rewards/reward_func/std": 0.4579470157623291,
      "step": 604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8262333199381828,
      "epoch": 1.181640625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1838967502117157,
      "learning_rate": 1.8210526315789475e-06,
      "loss": 0.0,
      "num_tokens": 2846184.0,
      "reward": 0.11140000075101852,
      "reward_std": 0.22280000150203705,
      "rewards/reward_func/mean": 0.11140000075101852,
      "rewards/reward_func/std": 0.3150867819786072,
      "step": 605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7038808278739452,
      "epoch": 1.18359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22406964004039764,
      "learning_rate": 1.8157894736842109e-06,
      "loss": 0.0,
      "num_tokens": 2850784.0,
      "reward": 0.7308000326156616,
      "reward_std": 0.4872533679008484,
      "rewards/reward_func/mean": 0.7307999730110168,
      "rewards/reward_func/std": 0.45112112164497375,
      "step": 606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8283609077334404,
      "epoch": 1.185546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2470514476299286,
      "learning_rate": 1.810526315789474e-06,
      "loss": -0.0,
      "num_tokens": 2855608.0,
      "reward": 0.7521250247955322,
      "reward_std": 0.4598061442375183,
      "rewards/reward_func/mean": 0.7521250247955322,
      "rewards/reward_func/std": 0.4257110357284546,
      "step": 607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6406707651913166,
      "epoch": 1.1875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2593001425266266,
      "learning_rate": 1.805263157894737e-06,
      "loss": 0.0,
      "num_tokens": 2860792.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7551974728703499,
      "epoch": 1.189453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2569890022277832,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 0.0,
      "num_tokens": 2865472.0,
      "reward": 0.635937511920929,
      "reward_std": 0.5168001055717468,
      "rewards/reward_func/mean": 0.635937511920929,
      "rewards/reward_func/std": 0.5031790733337402,
      "step": 609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8159989677369595,
      "epoch": 1.19140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2728988826274872,
      "learning_rate": 1.7947368421052633e-06,
      "loss": 0.0,
      "num_tokens": 2869952.0,
      "reward": 0.6223000288009644,
      "reward_std": 0.536927342414856,
      "rewards/reward_func/mean": 0.6223000288009644,
      "rewards/reward_func/std": 0.5153651237487793,
      "step": 610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5178343709558249,
      "epoch": 1.193359375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1404290795326233,
      "learning_rate": 1.7894736842105265e-06,
      "loss": 0.0,
      "num_tokens": 2874464.0,
      "reward": 0.6972000002861023,
      "reward_std": 0.2683524191379547,
      "rewards/reward_func/mean": 0.6972000002861023,
      "rewards/reward_func/std": 0.43032118678092957,
      "step": 611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5767571702599525,
      "epoch": 1.1953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2455584704875946,
      "learning_rate": 1.7842105263157896e-06,
      "loss": -0.0,
      "num_tokens": 2879264.0,
      "reward": 0.7221124768257141,
      "reward_std": 0.27265700697898865,
      "rewards/reward_func/mean": 0.7221125364303589,
      "rewards/reward_func/std": 0.4268017113208771,
      "step": 612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.568201445043087,
      "epoch": 1.197265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2082817703485489,
      "learning_rate": 1.7789473684210528e-06,
      "loss": 0.0,
      "num_tokens": 2884240.0,
      "reward": 0.01875000074505806,
      "reward_std": 0.028509706258773804,
      "rewards/reward_func/mean": 0.01875000074505806,
      "rewards/reward_func/std": 0.026726124808192253,
      "step": 613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6839352063834667,
      "epoch": 1.19921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18117974698543549,
      "learning_rate": 1.7736842105263162e-06,
      "loss": -0.0,
      "num_tokens": 2889224.0,
      "reward": 0.24700000882148743,
      "reward_std": 0.4290767312049866,
      "rewards/reward_func/mean": 0.24699999392032623,
      "rewards/reward_func/std": 0.39724788069725037,
      "step": 614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6872431337833405,
      "epoch": 1.201171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2561533749103546,
      "learning_rate": 1.768421052631579e-06,
      "loss": 0.0,
      "num_tokens": 2893976.0,
      "reward": 0.49361249804496765,
      "reward_std": 0.4846278727054596,
      "rewards/reward_func/mean": 0.49361249804496765,
      "rewards/reward_func/std": 0.5245352387428284,
      "step": 615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5922851376235485,
      "epoch": 1.203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20768021047115326,
      "learning_rate": 1.7631578947368423e-06,
      "loss": -0.0,
      "num_tokens": 2898728.0,
      "reward": 0.3634375035762787,
      "reward_std": 0.4954363703727722,
      "rewards/reward_func/mean": 0.3634375035762787,
      "rewards/reward_func/std": 0.4777008891105652,
      "step": 616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5938981436192989,
      "epoch": 1.205078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20559649169445038,
      "learning_rate": 1.7578947368421054e-06,
      "loss": 0.0,
      "num_tokens": 2903416.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 617
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8055331483483315,
      "epoch": 1.20703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26412415504455566,
      "learning_rate": 1.7526315789473686e-06,
      "loss": -0.0,
      "num_tokens": 2907952.0,
      "reward": 0.7447500228881836,
      "reward_std": 0.2991751432418823,
      "rewards/reward_func/mean": 0.7447500228881836,
      "rewards/reward_func/std": 0.4598980247974396,
      "step": 618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7401883006095886,
      "epoch": 1.208984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2682240605354309,
      "learning_rate": 1.7473684210526318e-06,
      "loss": -0.0,
      "num_tokens": 2912488.0,
      "reward": 0.7504249811172485,
      "reward_std": 0.49200356006622314,
      "rewards/reward_func/mean": 0.7504249811172485,
      "rewards/reward_func/std": 0.4555671811103821,
      "step": 619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5817705541849136,
      "epoch": 1.2109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20313231647014618,
      "learning_rate": 1.742105263157895e-06,
      "loss": 0.0,
      "num_tokens": 2917136.0,
      "reward": 0.23980000615119934,
      "reward_std": 0.4796000123023987,
      "rewards/reward_func/mean": 0.23980000615119934,
      "rewards/reward_func/std": 0.4441419541835785,
      "step": 620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5425855070352554,
      "epoch": 1.212890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24401021003723145,
      "learning_rate": 1.736842105263158e-06,
      "loss": 0.0,
      "num_tokens": 2922088.0,
      "reward": 0.23787501454353333,
      "reward_std": 0.4510968029499054,
      "rewards/reward_func/mean": 0.23787499964237213,
      "rewards/reward_func/std": 0.4178653955459595,
      "step": 621
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.43251657113432884,
      "epoch": 1.21484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2256747931241989,
      "learning_rate": 1.731578947368421e-06,
      "loss": -0.0,
      "num_tokens": 2927016.0,
      "reward": 0.38294997811317444,
      "reward_std": 0.5065792798995972,
      "rewards/reward_func/mean": 0.38294997811317444,
      "rewards/reward_func/std": 0.487602561712265,
      "step": 622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5035722590982914,
      "epoch": 1.216796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20219974219799042,
      "learning_rate": 1.7263157894736842e-06,
      "loss": -0.0,
      "num_tokens": 2931856.0,
      "reward": 0.7478624582290649,
      "reward_std": 0.46107497811317444,
      "rewards/reward_func/mean": 0.7478624582290649,
      "rewards/reward_func/std": 0.42688557505607605,
      "step": 623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6267634443938732,
      "epoch": 1.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2189279943704605,
      "learning_rate": 1.7210526315789474e-06,
      "loss": 0.0,
      "num_tokens": 2936608.0,
      "reward": 0.7312500476837158,
      "reward_std": 0.4379579424858093,
      "rewards/reward_func/mean": 0.7312500476837158,
      "rewards/reward_func/std": 0.40563005208969116,
      "step": 624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3103468529880047,
      "epoch": 1.220703125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.7157894736842107e-06,
      "loss": 0.0,
      "num_tokens": 2941112.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47227631881833076,
      "epoch": 1.22265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22579093277454376,
      "learning_rate": 1.710526315789474e-06,
      "loss": -0.0,
      "num_tokens": 2945904.0,
      "reward": 0.36162498593330383,
      "reward_std": 0.5129633545875549,
      "rewards/reward_func/mean": 0.36162498593330383,
      "rewards/reward_func/std": 0.4922618269920349,
      "step": 626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.2546942033804953,
      "epoch": 1.224609375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.705263157894737e-06,
      "loss": 0.0,
      "num_tokens": 2950400.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6319939978420734,
      "epoch": 1.2265625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14794784784317017,
      "learning_rate": 1.7000000000000002e-06,
      "loss": -0.0,
      "num_tokens": 2955392.0,
      "reward": 0.2150000035762787,
      "reward_std": 0.24852363765239716,
      "rewards/reward_func/mean": 0.2150000035762787,
      "rewards/reward_func/std": 0.39838388562202454,
      "step": 628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.716876182705164,
      "epoch": 1.228515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2454557716846466,
      "learning_rate": 1.6947368421052632e-06,
      "loss": 0.0,
      "num_tokens": 2960184.0,
      "reward": 0.3511999845504761,
      "reward_std": 0.5037546157836914,
      "rewards/reward_func/mean": 0.35120001435279846,
      "rewards/reward_func/std": 0.484712690114975,
      "step": 629
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.35377194359898567,
      "epoch": 1.23046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19669418036937714,
      "learning_rate": 1.6894736842105264e-06,
      "loss": -0.0,
      "num_tokens": 2964688.0,
      "reward": 0.5809999704360962,
      "reward_std": 0.5007524490356445,
      "rewards/reward_func/mean": 0.5809999704360962,
      "rewards/reward_func/std": 0.4811137020587921,
      "step": 630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5233113840222359,
      "epoch": 1.232421875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.6842105263157895e-06,
      "loss": 0.0,
      "num_tokens": 2969024.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49619767256081104,
      "epoch": 1.234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18364404141902924,
      "learning_rate": 1.6789473684210527e-06,
      "loss": 0.0,
      "num_tokens": 2974000.0,
      "reward": 0.2407499998807907,
      "reward_std": 0.26884350180625916,
      "rewards/reward_func/mean": 0.2407500147819519,
      "rewards/reward_func/std": 0.4001612961292267,
      "step": 632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6544572822749615,
      "epoch": 1.236328125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14807765185832977,
      "learning_rate": 1.673684210526316e-06,
      "loss": 0.0,
      "num_tokens": 2979000.0,
      "reward": 0.23649999499320984,
      "reward_std": 0.27312225103378296,
      "rewards/reward_func/mean": 0.23649999499320984,
      "rewards/reward_func/std": 0.4379509389400482,
      "step": 633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5549633167684078,
      "epoch": 1.23828125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12250755727291107,
      "learning_rate": 1.6684210526315792e-06,
      "loss": 0.0,
      "num_tokens": 2984016.0,
      "reward": 0.11140000075101852,
      "reward_std": 0.22280000150203705,
      "rewards/reward_func/mean": 0.11140000075101852,
      "rewards/reward_func/std": 0.3150867819786072,
      "step": 634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6770503744482994,
      "epoch": 1.240234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21593227982521057,
      "learning_rate": 1.6631578947368424e-06,
      "loss": 0.0,
      "num_tokens": 2988672.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.569544143974781,
      "epoch": 1.2421875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14851713180541992,
      "learning_rate": 1.6578947368421053e-06,
      "loss": 0.0,
      "num_tokens": 2993032.0,
      "reward": 0.878125011920929,
      "reward_std": 0.24375002086162567,
      "rewards/reward_func/mean": 0.878125011920929,
      "rewards/reward_func/std": 0.34471458196640015,
      "step": 636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7452700100839138,
      "epoch": 1.244140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23980359733104706,
      "learning_rate": 1.6526315789473685e-06,
      "loss": -0.0,
      "num_tokens": 2998208.0,
      "reward": 0.2562499940395355,
      "reward_std": 0.3011751174926758,
      "rewards/reward_func/mean": 0.2562499940395355,
      "rewards/reward_func/std": 0.4593765139579773,
      "step": 637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5180889405310154,
      "epoch": 1.24609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2281617522239685,
      "learning_rate": 1.6473684210526317e-06,
      "loss": 0.0,
      "num_tokens": 3003064.0,
      "reward": 0.48240000009536743,
      "reward_std": 0.5435653924942017,
      "rewards/reward_func/mean": 0.48240000009536743,
      "rewards/reward_func/std": 0.5036382079124451,
      "step": 638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5895957332104445,
      "epoch": 1.248046875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16368906199932098,
      "learning_rate": 1.6421052631578948e-06,
      "loss": 0.0,
      "num_tokens": 3007408.0,
      "reward": 0.75,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.713409535586834,
      "epoch": 1.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2973420023918152,
      "learning_rate": 1.636842105263158e-06,
      "loss": 0.0,
      "num_tokens": 3012280.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6381527129560709,
      "epoch": 1.251953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20075997710227966,
      "learning_rate": 1.6315789473684212e-06,
      "loss": 0.0,
      "num_tokens": 3017240.0,
      "reward": 0.23475000262260437,
      "reward_std": 0.4529085159301758,
      "rewards/reward_func/mean": 0.23475000262260437,
      "rewards/reward_func/std": 0.41961437463760376,
      "step": 641
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5148098133504391,
      "epoch": 1.25390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20074260234832764,
      "learning_rate": 1.6263157894736845e-06,
      "loss": 0.0,
      "num_tokens": 3022208.0,
      "reward": 0.22257500886917114,
      "reward_std": 0.26535123586654663,
      "rewards/reward_func/mean": 0.22257500886917114,
      "rewards/reward_func/std": 0.39029690623283386,
      "step": 642
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7974681630730629,
      "epoch": 1.255859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24957697093486786,
      "learning_rate": 1.6210526315789473e-06,
      "loss": -0.0,
      "num_tokens": 3026848.0,
      "reward": 0.48893749713897705,
      "reward_std": 0.4796658158302307,
      "rewards/reward_func/mean": 0.48893749713897705,
      "rewards/reward_func/std": 0.512710690498352,
      "step": 643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6741067506372929,
      "epoch": 1.2578125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1625678539276123,
      "learning_rate": 1.6157894736842106e-06,
      "loss": -0.0,
      "num_tokens": 3031184.0,
      "reward": 0.9947500228881836,
      "reward_std": 0.010500003583729267,
      "rewards/reward_func/mean": 0.9947500228881836,
      "rewards/reward_func/std": 0.014849240891635418,
      "step": 644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6193629987537861,
      "epoch": 1.259765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23957446217536926,
      "learning_rate": 1.6105263157894738e-06,
      "loss": 0.0,
      "num_tokens": 3035928.0,
      "reward": 0.6013749837875366,
      "reward_std": 0.5029712319374084,
      "rewards/reward_func/mean": 0.6013749837875366,
      "rewards/reward_func/std": 0.4784298539161682,
      "step": 645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8137162700295448,
      "epoch": 1.26171875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14833250641822815,
      "learning_rate": 1.605263157894737e-06,
      "loss": -0.0,
      "num_tokens": 3040408.0,
      "reward": 0.879687488079071,
      "reward_std": 0.24062499403953552,
      "rewards/reward_func/mean": 0.879687488079071,
      "rewards/reward_func/std": 0.34029513597488403,
      "step": 646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6442223079502583,
      "epoch": 1.263671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2811136841773987,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 0.0,
      "num_tokens": 3045264.0,
      "reward": 0.350600004196167,
      "reward_std": 0.5062717199325562,
      "rewards/reward_func/mean": 0.350600004196167,
      "rewards/reward_func/std": 0.4842973053455353,
      "step": 647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3350534597411752,
      "epoch": 1.265625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1243121474981308,
      "learning_rate": 1.5947368421052633e-06,
      "loss": -0.0,
      "num_tokens": 3049752.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.575018860399723,
      "epoch": 1.267578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22247184813022614,
      "learning_rate": 1.5894736842105265e-06,
      "loss": 0.0,
      "num_tokens": 3054392.0,
      "reward": 0.5039750337600708,
      "reward_std": 0.5604059100151062,
      "rewards/reward_func/mean": 0.5039750337600708,
      "rewards/reward_func/std": 0.5189851522445679,
      "step": 649
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3880303241312504,
      "epoch": 1.26953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1775999665260315,
      "learning_rate": 1.5842105263157894e-06,
      "loss": 0.0,
      "num_tokens": 3059368.0,
      "reward": 0.23032499849796295,
      "reward_std": 0.2687353491783142,
      "rewards/reward_func/mean": 0.23032499849796295,
      "rewards/reward_func/std": 0.4190002381801605,
      "step": 650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4622784499078989,
      "epoch": 1.271484375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1834346204996109,
      "learning_rate": 1.5789473684210526e-06,
      "loss": -0.0,
      "num_tokens": 3063872.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8611919060349464,
      "epoch": 1.2734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3340691924095154,
      "learning_rate": 1.573684210526316e-06,
      "loss": 0.0,
      "num_tokens": 3068736.0,
      "reward": 0.7515624761581421,
      "reward_std": 0.49687498807907104,
      "rewards/reward_func/mean": 0.7515624761581421,
      "rewards/reward_func/std": 0.4600290060043335,
      "step": 652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6393865756690502,
      "epoch": 1.275390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22418461740016937,
      "learning_rate": 1.5684210526315791e-06,
      "loss": 0.0,
      "num_tokens": 3073368.0,
      "reward": 0.4921249747276306,
      "reward_std": 0.5610401630401611,
      "rewards/reward_func/mean": 0.492125004529953,
      "rewards/reward_func/std": 0.519443690776825,
      "step": 653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5920063890516758,
      "epoch": 1.27734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26118144392967224,
      "learning_rate": 1.5631578947368423e-06,
      "loss": 0.0,
      "num_tokens": 3078168.0,
      "reward": 0.617437481880188,
      "reward_std": 0.25803983211517334,
      "rewards/reward_func/mean": 0.617437481880188,
      "rewards/reward_func/std": 0.4884096086025238,
      "step": 654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6909713298082352,
      "epoch": 1.279296875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18258942663669586,
      "learning_rate": 1.5578947368421054e-06,
      "loss": 0.0,
      "num_tokens": 3083008.0,
      "reward": 0.11959999799728394,
      "reward_std": 0.23919998109340668,
      "rewards/reward_func/mean": 0.11959999799728394,
      "rewards/reward_func/std": 0.3382799029350281,
      "step": 655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.657719861716032,
      "epoch": 1.28125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1802099496126175,
      "learning_rate": 1.5526315789473686e-06,
      "loss": 0.0,
      "num_tokens": 3087856.0,
      "reward": 0.11620000004768372,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.11620000004768372,
      "rewards/reward_func/std": 0.32866325974464417,
      "step": 656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6168779209256172,
      "epoch": 1.283203125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1258898824453354,
      "learning_rate": 1.5473684210526316e-06,
      "loss": -0.0,
      "num_tokens": 3092488.0,
      "reward": 0.625,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6636889725923538,
      "epoch": 1.28515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2637583911418915,
      "learning_rate": 1.5421052631578947e-06,
      "loss": 0.0,
      "num_tokens": 3097240.0,
      "reward": 0.8300000429153442,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.8300000429153442,
      "rewards/reward_func/std": 0.33602720499038696,
      "step": 658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8549417704343796,
      "epoch": 1.287109375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.5368421052631579e-06,
      "loss": 0.0,
      "num_tokens": 3101800.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 659
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5893873460590839,
      "epoch": 1.2890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22459273040294647,
      "learning_rate": 1.5315789473684213e-06,
      "loss": -0.0,
      "num_tokens": 3106536.0,
      "reward": 0.598437488079071,
      "reward_std": 0.4938383102416992,
      "rewards/reward_func/mean": 0.5984375476837158,
      "rewards/reward_func/std": 0.47170084714889526,
      "step": 660
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7923720888793468,
      "epoch": 1.291015625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18553104996681213,
      "learning_rate": 1.5263157894736844e-06,
      "loss": 0.0,
      "num_tokens": 3111304.0,
      "reward": 0.8560999631881714,
      "reward_std": 0.24459999799728394,
      "rewards/reward_func/mean": 0.8560999631881714,
      "rewards/reward_func/std": 0.34591665863990784,
      "step": 661
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.46678472869098186,
      "epoch": 1.29296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.256147563457489,
      "learning_rate": 1.5210526315789476e-06,
      "loss": 0.0,
      "num_tokens": 3116096.0,
      "reward": 0.5954375267028809,
      "reward_std": 0.48662465810775757,
      "rewards/reward_func/mean": 0.5954374670982361,
      "rewards/reward_func/std": 0.4691931903362274,
      "step": 662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7971262335777283,
      "epoch": 1.294921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2475176900625229,
      "learning_rate": 1.5157894736842108e-06,
      "loss": -0.0,
      "num_tokens": 3120744.0,
      "reward": 0.5011875033378601,
      "reward_std": 0.5680569410324097,
      "rewards/reward_func/mean": 0.5011875033378601,
      "rewards/reward_func/std": 0.5259928107261658,
      "step": 663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4731484353542328,
      "epoch": 1.296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18087652325630188,
      "learning_rate": 1.5105263157894737e-06,
      "loss": 0.0,
      "num_tokens": 3125712.0,
      "reward": 0.11857499927282333,
      "reward_std": 0.2205643504858017,
      "rewards/reward_func/mean": 0.11857499927282333,
      "rewards/reward_func/std": 0.3053269386291504,
      "step": 664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7882614396512508,
      "epoch": 1.298828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2276054322719574,
      "learning_rate": 1.5052631578947369e-06,
      "loss": -0.0,
      "num_tokens": 3130712.0,
      "reward": 0.4634000062942505,
      "reward_std": 0.4610704183578491,
      "rewards/reward_func/mean": 0.4634000062942505,
      "rewards/reward_func/std": 0.4954361617565155,
      "step": 665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6726894378662109,
      "epoch": 1.30078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25436362624168396,
      "learning_rate": 1.5e-06,
      "loss": 0.0,
      "num_tokens": 3135368.0,
      "reward": 0.6215875148773193,
      "reward_std": 0.5241646766662598,
      "rewards/reward_func/mean": 0.6215875148773193,
      "rewards/reward_func/std": 0.5046280026435852,
      "step": 666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7788518108427525,
      "epoch": 1.302734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2387314885854721,
      "learning_rate": 1.4947368421052632e-06,
      "loss": 0.0,
      "num_tokens": 3140104.0,
      "reward": 0.721875011920929,
      "reward_std": 0.2759787440299988,
      "rewards/reward_func/mean": 0.721875011920929,
      "rewards/reward_func/std": 0.42338719964027405,
      "step": 667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6936579272150993,
      "epoch": 1.3046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21527528762817383,
      "learning_rate": 1.4894736842105264e-06,
      "loss": 0.0,
      "num_tokens": 3145096.0,
      "reward": 0.23100000619888306,
      "reward_std": 0.4619999825954437,
      "rewards/reward_func/mean": 0.23100000619888306,
      "rewards/reward_func/std": 0.42808806896209717,
      "step": 668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.649364996701479,
      "epoch": 1.306640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25936535000801086,
      "learning_rate": 1.4842105263157897e-06,
      "loss": -0.0,
      "num_tokens": 3149888.0,
      "reward": 0.8514000177383423,
      "reward_std": 0.2537573277950287,
      "rewards/reward_func/mean": 0.8514000177383423,
      "rewards/reward_func/std": 0.3446618318557739,
      "step": 669
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7287271581590176,
      "epoch": 1.30859375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17086577415466309,
      "learning_rate": 1.478947368421053e-06,
      "loss": 0.0,
      "num_tokens": 3154360.0,
      "reward": 0.7593749761581421,
      "reward_std": 0.27784982323646545,
      "rewards/reward_func/mean": 0.7593749761581421,
      "rewards/reward_func/std": 0.44555091857910156,
      "step": 670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47401850298047066,
      "epoch": 1.310546875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17154361307621002,
      "learning_rate": 1.4736842105263159e-06,
      "loss": 0.0,
      "num_tokens": 3159040.0,
      "reward": 0.8828125,
      "reward_std": 0.234375,
      "rewards/reward_func/mean": 0.8828125,
      "rewards/reward_func/std": 0.3314563035964966,
      "step": 671
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5196984093636274,
      "epoch": 1.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.185403510928154,
      "learning_rate": 1.468421052631579e-06,
      "loss": -0.0,
      "num_tokens": 3164008.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.016983473673462868,
      "rewards/reward_func/mean": 0.012500000186264515,
      "rewards/reward_func/std": 0.021128857508301735,
      "step": 672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3447682708501816,
      "epoch": 1.314453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18098026514053345,
      "learning_rate": 1.4631578947368422e-06,
      "loss": -0.0,
      "num_tokens": 3168520.0,
      "reward": 0.6972000002861023,
      "reward_std": 0.46480000019073486,
      "rewards/reward_func/mean": 0.6972000002861023,
      "rewards/reward_func/std": 0.43032118678092957,
      "step": 673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5853043086826801,
      "epoch": 1.31640625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.4578947368421053e-06,
      "loss": 0.0,
      "num_tokens": 3172856.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6624711155891418,
      "epoch": 1.318359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24813523888587952,
      "learning_rate": 1.4526315789473685e-06,
      "loss": 0.0,
      "num_tokens": 3177696.0,
      "reward": 0.7400499582290649,
      "reward_std": 0.4767000079154968,
      "rewards/reward_func/mean": 0.7400499582290649,
      "rewards/reward_func/std": 0.4415406882762909,
      "step": 675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8206646218895912,
      "epoch": 1.3203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2409617155790329,
      "learning_rate": 1.4473684210526317e-06,
      "loss": -0.0,
      "num_tokens": 3182448.0,
      "reward": 0.8735250234603882,
      "reward_std": 0.23993384838104248,
      "rewards/reward_func/mean": 0.8735250234603882,
      "rewards/reward_func/std": 0.32304173707962036,
      "step": 676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6698844209313393,
      "epoch": 1.322265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.225163534283638,
      "learning_rate": 1.442105263157895e-06,
      "loss": -0.0,
      "num_tokens": 3187224.0,
      "reward": 0.3618749976158142,
      "reward_std": 0.4973260462284088,
      "rewards/reward_func/mean": 0.3618749976158142,
      "rewards/reward_func/std": 0.4790313243865967,
      "step": 677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5397326983511448,
      "epoch": 1.32421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17920690774917603,
      "learning_rate": 1.4368421052631582e-06,
      "loss": -0.0,
      "num_tokens": 3191880.0,
      "reward": 0.49943751096725464,
      "reward_std": 0.49373185634613037,
      "rewards/reward_func/mean": 0.49943751096725464,
      "rewards/reward_func/std": 0.5242229104042053,
      "step": 678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6977795027196407,
      "epoch": 1.326171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26054197549819946,
      "learning_rate": 1.4315789473684212e-06,
      "loss": 0.0,
      "num_tokens": 3196416.0,
      "reward": 0.7473000288009644,
      "reward_std": 0.4982522130012512,
      "rewards/reward_func/mean": 0.7473000288009644,
      "rewards/reward_func/std": 0.4613037705421448,
      "step": 679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7697173990309238,
      "epoch": 1.328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26796966791152954,
      "learning_rate": 1.4263157894736843e-06,
      "loss": 0.0,
      "num_tokens": 3201416.0,
      "reward": 0.2418999969959259,
      "reward_std": 0.4837999939918518,
      "rewards/reward_func/mean": 0.2418999969959259,
      "rewards/reward_func/std": 0.44794899225234985,
      "step": 680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5212496444582939,
      "epoch": 1.330078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21516187489032745,
      "learning_rate": 1.4210526315789475e-06,
      "loss": -0.0,
      "num_tokens": 3206208.0,
      "reward": 0.24524998664855957,
      "reward_std": 0.4739798605442047,
      "rewards/reward_func/mean": 0.24524998664855957,
      "rewards/reward_func/std": 0.43882009387016296,
      "step": 681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.694312822073698,
      "epoch": 1.33203125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16465605795383453,
      "learning_rate": 1.4157894736842107e-06,
      "loss": 0.0,
      "num_tokens": 3210816.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6481199897825718,
      "epoch": 1.333984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23585893213748932,
      "learning_rate": 1.4105263157894738e-06,
      "loss": 0.0,
      "num_tokens": 3215672.0,
      "reward": 0.7447500228881836,
      "reward_std": 0.4966987073421478,
      "rewards/reward_func/mean": 0.7447500228881836,
      "rewards/reward_func/std": 0.4598980247974396,
      "step": 683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7410243712365627,
      "epoch": 1.3359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24909096956253052,
      "learning_rate": 1.405263157894737e-06,
      "loss": -0.0,
      "num_tokens": 3220472.0,
      "reward": 0.23709999024868011,
      "reward_std": 0.4575529098510742,
      "rewards/reward_func/mean": 0.2371000051498413,
      "rewards/reward_func/std": 0.42398691177368164,
      "step": 684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.34077938739210367,
      "epoch": 1.337890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1768733263015747,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 0.0,
      "num_tokens": 3224976.0,
      "reward": 0.8226249814033508,
      "reward_std": 0.2383500039577484,
      "rewards/reward_func/mean": 0.8226249814033508,
      "rewards/reward_func/std": 0.3227412700653076,
      "step": 685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5411074692383409,
      "epoch": 1.33984375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11871396750211716,
      "learning_rate": 1.394736842105263e-06,
      "loss": 0.0,
      "num_tokens": 3229472.0,
      "reward": 0.8113000392913818,
      "reward_std": 0.23103395104408264,
      "rewards/reward_func/mean": 0.8113000392913818,
      "rewards/reward_func/std": 0.32786741852760315,
      "step": 686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.45428065955638885,
      "epoch": 1.341796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1815212070941925,
      "learning_rate": 1.3894736842105263e-06,
      "loss": 0.0,
      "num_tokens": 3234432.0,
      "reward": 0.43700000643730164,
      "reward_std": 0.4397338032722473,
      "rewards/reward_func/mean": 0.43700000643730164,
      "rewards/reward_func/std": 0.46753212809562683,
      "step": 687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5205916427075863,
      "epoch": 1.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21481692790985107,
      "learning_rate": 1.3842105263157896e-06,
      "loss": -0.0,
      "num_tokens": 3239224.0,
      "reward": 0.13824999332427979,
      "reward_std": 0.24868974089622498,
      "rewards/reward_func/mean": 0.13824999332427979,
      "rewards/reward_func/std": 0.33114388585090637,
      "step": 688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47748175263404846,
      "epoch": 1.345703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1996050328016281,
      "learning_rate": 1.3789473684210528e-06,
      "loss": -0.0,
      "num_tokens": 3244008.0,
      "reward": 0.3589249849319458,
      "reward_std": 0.509075939655304,
      "rewards/reward_func/mean": 0.3589249849319458,
      "rewards/reward_func/std": 0.4885358214378357,
      "step": 689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7458081506192684,
      "epoch": 1.34765625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1602766215801239,
      "learning_rate": 1.373684210526316e-06,
      "loss": -0.0,
      "num_tokens": 3248512.0,
      "reward": 0.8754249811172485,
      "reward_std": 0.24200356006622314,
      "rewards/reward_func/mean": 0.8754249811172485,
      "rewards/reward_func/std": 0.34370672702789307,
      "step": 690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5473680794239044,
      "epoch": 1.349609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2388388216495514,
      "learning_rate": 1.3684210526315791e-06,
      "loss": 0.0,
      "num_tokens": 3252872.0,
      "reward": 0.515625,
      "reward_std": 0.4859190583229065,
      "rewards/reward_func/mean": 0.515625,
      "rewards/reward_func/std": 0.5182280540466309,
      "step": 691
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3880773186683655,
      "epoch": 1.3515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21450631320476532,
      "learning_rate": 1.3631578947368423e-06,
      "loss": 0.0,
      "num_tokens": 3257656.0,
      "reward": 0.5974999666213989,
      "reward_std": 0.5149734020233154,
      "rewards/reward_func/mean": 0.5974999666213989,
      "rewards/reward_func/std": 0.49477702379226685,
      "step": 692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5613982677459717,
      "epoch": 1.353515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20392994582653046,
      "learning_rate": 1.3578947368421052e-06,
      "loss": 0.0,
      "num_tokens": 3262312.0,
      "reward": 0.7505750060081482,
      "reward_std": 0.4759466052055359,
      "rewards/reward_func/mean": 0.7505750060081482,
      "rewards/reward_func/std": 0.44124478101730347,
      "step": 693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6902660578489304,
      "epoch": 1.35546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2161121368408203,
      "learning_rate": 1.3526315789473684e-06,
      "loss": -0.0,
      "num_tokens": 3267328.0,
      "reward": 0.5890499949455261,
      "reward_std": 0.5059839487075806,
      "rewards/reward_func/mean": 0.5890499949455261,
      "rewards/reward_func/std": 0.4884392023086548,
      "step": 694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5899221636354923,
      "epoch": 1.357421875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.3473684210526316e-06,
      "loss": 0.0,
      "num_tokens": 3272280.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9027910940349102,
      "epoch": 1.359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25422924757003784,
      "learning_rate": 1.342105263157895e-06,
      "loss": 0.0,
      "num_tokens": 3276816.0,
      "reward": 0.5062500238418579,
      "reward_std": 0.4917367100715637,
      "rewards/reward_func/mean": 0.5062500238418579,
      "rewards/reward_func/std": 0.5279255509376526,
      "step": 696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6606407798826694,
      "epoch": 1.361328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2949965000152588,
      "learning_rate": 1.3368421052631581e-06,
      "loss": -0.0,
      "num_tokens": 3281448.0,
      "reward": 0.7399374842643738,
      "reward_std": 0.46412500739097595,
      "rewards/reward_func/mean": 0.7399374842643738,
      "rewards/reward_func/std": 0.43033212423324585,
      "step": 697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6454514786601067,
      "epoch": 1.36328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2416992336511612,
      "learning_rate": 1.3315789473684213e-06,
      "loss": 0.0,
      "num_tokens": 3285792.0,
      "reward": 0.762499988079071,
      "reward_std": 0.4749999940395355,
      "rewards/reward_func/mean": 0.762499988079071,
      "rewards/reward_func/std": 0.4397645592689514,
      "step": 698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6855951938778162,
      "epoch": 1.365234375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15963053703308105,
      "learning_rate": 1.3263157894736844e-06,
      "loss": 0.0,
      "num_tokens": 3290760.0,
      "reward": 0.11100000143051147,
      "reward_std": 0.22200000286102295,
      "rewards/reward_func/mean": 0.11100000143051147,
      "rewards/reward_func/std": 0.3139553964138031,
      "step": 699
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7190686650574207,
      "epoch": 1.3671875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1730789840221405,
      "learning_rate": 1.3210526315789474e-06,
      "loss": 0.0,
      "num_tokens": 3295248.0,
      "reward": 0.8765624761581421,
      "reward_std": 0.24687500298023224,
      "rewards/reward_func/mean": 0.8765624761581421,
      "rewards/reward_func/std": 0.34913399815559387,
      "step": 700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6450852937996387,
      "epoch": 1.369140625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.3157894736842106e-06,
      "loss": 0.0,
      "num_tokens": 3299880.0,
      "reward": 0.972000002861023,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.972000002861023,
      "rewards/reward_func/std": 0.0,
      "step": 701
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8256732821464539,
      "epoch": 1.37109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2364703118801117,
      "learning_rate": 1.3105263157894737e-06,
      "loss": 0.0,
      "num_tokens": 3304512.0,
      "reward": 0.8487499952316284,
      "reward_std": 0.24650000035762787,
      "rewards/reward_func/mean": 0.8487499952316284,
      "rewards/reward_func/std": 0.3429817855358124,
      "step": 702
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6302417553961277,
      "epoch": 1.373046875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1295423060655594,
      "learning_rate": 1.3052631578947369e-06,
      "loss": 0.0,
      "num_tokens": 3309296.0,
      "reward": 0.11749999970197678,
      "reward_std": 0.23499999940395355,
      "rewards/reward_func/mean": 0.11749999970197678,
      "rewards/reward_func/std": 0.33234018087387085,
      "step": 703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.957762099802494,
      "epoch": 1.375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18025648593902588,
      "learning_rate": 1.3e-06,
      "loss": 0.0,
      "num_tokens": 3314128.0,
      "reward": 0.11959999799728394,
      "reward_std": 0.23919998109340668,
      "rewards/reward_func/mean": 0.11959999799728394,
      "rewards/reward_func/std": 0.3382799029350281,
      "step": 704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7645817883312702,
      "epoch": 1.376953125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1874340921640396,
      "learning_rate": 1.2947368421052634e-06,
      "loss": -0.0,
      "num_tokens": 3318736.0,
      "reward": 0.7447500228881836,
      "reward_std": 0.2827429473400116,
      "rewards/reward_func/mean": 0.7447500228881836,
      "rewards/reward_func/std": 0.4598980247974396,
      "step": 705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.732977319508791,
      "epoch": 1.37890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23621387779712677,
      "learning_rate": 1.2894736842105266e-06,
      "loss": -0.0,
      "num_tokens": 3323384.0,
      "reward": 0.8697500228881836,
      "reward_std": 0.2605000138282776,
      "rewards/reward_func/mean": 0.8697500228881836,
      "rewards/reward_func/std": 0.3517392575740814,
      "step": 706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8189778625965118,
      "epoch": 1.380859375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16602914035320282,
      "learning_rate": 1.2842105263157895e-06,
      "loss": -0.0,
      "num_tokens": 3327856.0,
      "reward": 0.9947500228881836,
      "reward_std": 0.010500003583729267,
      "rewards/reward_func/mean": 0.9947500228881836,
      "rewards/reward_func/std": 0.014849240891635418,
      "step": 707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5835609417408705,
      "epoch": 1.3828125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.20208346843719482,
      "learning_rate": 1.2789473684210527e-06,
      "loss": 0.0,
      "num_tokens": 3332536.0,
      "reward": 0.635937511920929,
      "reward_std": 0.24358300864696503,
      "rewards/reward_func/mean": 0.635937511920929,
      "rewards/reward_func/std": 0.5031790733337402,
      "step": 708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3960736747831106,
      "epoch": 1.384765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19159826636314392,
      "learning_rate": 1.2736842105263159e-06,
      "loss": 0.0,
      "num_tokens": 3337040.0,
      "reward": 0.8113000392913818,
      "reward_std": 0.23659999668598175,
      "rewards/reward_func/mean": 0.8113000392913818,
      "rewards/reward_func/std": 0.32786741852760315,
      "step": 709
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7697632871568203,
      "epoch": 1.38671875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16308048367500305,
      "learning_rate": 1.268421052631579e-06,
      "loss": 0.0,
      "num_tokens": 3342048.0,
      "reward": 0.12229999899864197,
      "reward_std": 0.24459999799728394,
      "rewards/reward_func/mean": 0.12229999899864197,
      "rewards/reward_func/std": 0.34591662883758545,
      "step": 710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6840203031897545,
      "epoch": 1.388671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23875313997268677,
      "learning_rate": 1.2631578947368422e-06,
      "loss": 0.0,
      "num_tokens": 3346576.0,
      "reward": 0.7488625049591064,
      "reward_std": 0.4951278567314148,
      "rewards/reward_func/mean": 0.7488625049591064,
      "rewards/reward_func/std": 0.4584231972694397,
      "step": 711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6921983473002911,
      "epoch": 1.390625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.21332110464572906,
      "learning_rate": 1.2578947368421054e-06,
      "loss": 0.0,
      "num_tokens": 3351320.0,
      "reward": 0.47312501072883606,
      "reward_std": 0.0062500000931322575,
      "rewards/reward_func/mean": 0.47312498092651367,
      "rewards/reward_func/std": 0.4991774260997772,
      "step": 712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7641146704554558,
      "epoch": 1.392578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21671804785728455,
      "learning_rate": 1.2526315789473687e-06,
      "loss": 0.0,
      "num_tokens": 3356000.0,
      "reward": 0.8525000214576721,
      "reward_std": 0.25541630387306213,
      "rewards/reward_func/mean": 0.8525000214576721,
      "rewards/reward_func/std": 0.34573936462402344,
      "step": 713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8032694086432457,
      "epoch": 1.39453125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.2227100133895874,
      "learning_rate": 1.2473684210526317e-06,
      "loss": 0.0,
      "num_tokens": 3360616.0,
      "reward": 0.8529999852180481,
      "reward_std": 0.2427220493555069,
      "rewards/reward_func/mean": 0.8529999852180481,
      "rewards/reward_func/std": 0.3447500169277191,
      "step": 714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.20121474005281925,
      "epoch": 1.396484375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11205914616584778,
      "learning_rate": 1.2421052631578948e-06,
      "loss": -0.0,
      "num_tokens": 3365096.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7984618470072746,
      "epoch": 1.3984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23668164014816284,
      "learning_rate": 1.236842105263158e-06,
      "loss": -0.0,
      "num_tokens": 3369616.0,
      "reward": 0.7487249970436096,
      "reward_std": 0.28503718972206116,
      "rewards/reward_func/mean": 0.7487249970436096,
      "rewards/reward_func/std": 0.4394227862358093,
      "step": 716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6621723957359791,
      "epoch": 1.400390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21383367478847504,
      "learning_rate": 1.2315789473684212e-06,
      "loss": -0.0,
      "num_tokens": 3374384.0,
      "reward": 0.5983499884605408,
      "reward_std": 0.5136319994926453,
      "rewards/reward_func/mean": 0.5983499884605408,
      "rewards/reward_func/std": 0.4956625998020172,
      "step": 717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6606850810348988,
      "epoch": 1.40234375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17316952347755432,
      "learning_rate": 1.2263157894736843e-06,
      "loss": 0.0,
      "num_tokens": 3379384.0,
      "reward": 0.11959999799728394,
      "reward_std": 0.23919998109340668,
      "rewards/reward_func/mean": 0.11959999799728394,
      "rewards/reward_func/std": 0.3382799029350281,
      "step": 718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.23659847397357225,
      "epoch": 1.404296875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13336242735385895,
      "learning_rate": 1.2210526315789475e-06,
      "loss": -0.0,
      "num_tokens": 3383896.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 719
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6463708803057671,
      "epoch": 1.40625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25290825963020325,
      "learning_rate": 1.2157894736842107e-06,
      "loss": 0.0,
      "num_tokens": 3388760.0,
      "reward": 0.7478749752044678,
      "reward_std": 0.49045124650001526,
      "rewards/reward_func/mean": 0.7478749752044678,
      "rewards/reward_func/std": 0.4541146159172058,
      "step": 720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7543365843594074,
      "epoch": 1.408203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24611547589302063,
      "learning_rate": 1.2105263157894738e-06,
      "loss": 0.0,
      "num_tokens": 3393488.0,
      "reward": 0.8598625063896179,
      "reward_std": 0.2406606525182724,
      "rewards/reward_func/mean": 0.8598625063896179,
      "rewards/reward_func/std": 0.3235507011413574,
      "step": 721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.2327856719493866,
      "epoch": 1.41015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1801598221063614,
      "learning_rate": 1.205263157894737e-06,
      "loss": -0.0,
      "num_tokens": 3398000.0,
      "reward": 0.7059999704360962,
      "reward_std": 0.2859524190425873,
      "rewards/reward_func/mean": 0.7059999704360962,
      "rewards/reward_func/std": 0.43642914295196533,
      "step": 722
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4816428292542696,
      "epoch": 1.412109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20217570662498474,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 0.0,
      "num_tokens": 3402960.0,
      "reward": 0.10868749767541885,
      "reward_std": 0.210587739944458,
      "rewards/reward_func/mean": 0.10868749767541885,
      "rewards/reward_func/std": 0.2923278510570526,
      "step": 723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5045717675238848,
      "epoch": 1.4140625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12922586500644684,
      "learning_rate": 1.1947368421052633e-06,
      "loss": 0.0,
      "num_tokens": 3407472.0,
      "reward": 0.805899977684021,
      "reward_std": 0.22783933579921722,
      "rewards/reward_func/mean": 0.805899977684021,
      "rewards/reward_func/std": 0.326308935880661,
      "step": 724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.551116107031703,
      "epoch": 1.416015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2109834998846054,
      "learning_rate": 1.1894736842105265e-06,
      "loss": -0.0,
      "num_tokens": 3412256.0,
      "reward": 0.24056249856948853,
      "reward_std": 0.47697657346725464,
      "rewards/reward_func/mean": 0.24056249856948853,
      "rewards/reward_func/std": 0.44159868359565735,
      "step": 725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5529836490750313,
      "epoch": 1.41796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22575944662094116,
      "learning_rate": 1.1842105263157894e-06,
      "loss": 0.0,
      "num_tokens": 3417056.0,
      "reward": 0.4667624831199646,
      "reward_std": 0.535490095615387,
      "rewards/reward_func/mean": 0.4667624831199646,
      "rewards/reward_func/std": 0.4958255887031555,
      "step": 726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.39940457232296467,
      "epoch": 1.419921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18727923929691315,
      "learning_rate": 1.1789473684210526e-06,
      "loss": 0.0,
      "num_tokens": 3421848.0,
      "reward": 0.48112499713897705,
      "reward_std": 0.548362135887146,
      "rewards/reward_func/mean": 0.48112499713897705,
      "rewards/reward_func/std": 0.5076847076416016,
      "step": 727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7712520267814398,
      "epoch": 1.421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23952317237854004,
      "learning_rate": 1.173684210526316e-06,
      "loss": -0.0,
      "num_tokens": 3426856.0,
      "reward": 0.23100000619888306,
      "reward_std": 0.4619999825954437,
      "rewards/reward_func/mean": 0.23100000619888306,
      "rewards/reward_func/std": 0.42808806896209717,
      "step": 728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7155905775725842,
      "epoch": 1.423828125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14729009568691254,
      "learning_rate": 1.1684210526315791e-06,
      "loss": -0.0,
      "num_tokens": 3431856.0,
      "reward": 0.3450999855995178,
      "reward_std": 0.2304711490869522,
      "rewards/reward_func/mean": 0.3450999855995178,
      "rewards/reward_func/std": 0.47661837935447693,
      "step": 729
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5209857858717442,
      "epoch": 1.42578125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17405037581920624,
      "learning_rate": 1.163157894736842e-06,
      "loss": -0.0,
      "num_tokens": 3436200.0,
      "reward": 0.7718750238418579,
      "reward_std": 0.2634160816669464,
      "rewards/reward_func/mean": 0.7718750238418579,
      "rewards/reward_func/std": 0.4224054515361786,
      "step": 730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.75539655610919,
      "epoch": 1.427734375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18450260162353516,
      "learning_rate": 1.1578947368421053e-06,
      "loss": 0.0,
      "num_tokens": 3440560.0,
      "reward": 0.7578125,
      "reward_std": 0.2797587811946869,
      "rewards/reward_func/mean": 0.7578125,
      "rewards/reward_func/std": 0.4485560953617096,
      "step": 731
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6287768185138702,
      "epoch": 1.4296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21361912786960602,
      "learning_rate": 1.1526315789473686e-06,
      "loss": -0.0,
      "num_tokens": 3445208.0,
      "reward": 0.8587499856948853,
      "reward_std": 0.2550273835659027,
      "rewards/reward_func/mean": 0.8587499856948853,
      "rewards/reward_func/std": 0.3472658097743988,
      "step": 732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8078473433852196,
      "epoch": 1.431640625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1746349185705185,
      "learning_rate": 1.1473684210526316e-06,
      "loss": 0.0,
      "num_tokens": 3449704.0,
      "reward": 0.753125011920929,
      "reward_std": 0.2851123511791229,
      "rewards/reward_func/mean": 0.753125011920929,
      "rewards/reward_func/std": 0.4571725130081177,
      "step": 733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6579478699713945,
      "epoch": 1.43359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20288410782814026,
      "learning_rate": 1.1421052631578947e-06,
      "loss": 0.0,
      "num_tokens": 3454664.0,
      "reward": 0.3410625159740448,
      "reward_std": 0.4895520806312561,
      "rewards/reward_func/mean": 0.3410625159740448,
      "rewards/reward_func/std": 0.4675552546977997,
      "step": 734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6510977298021317,
      "epoch": 1.435546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21478044986724854,
      "learning_rate": 1.136842105263158e-06,
      "loss": 0.0,
      "num_tokens": 3459456.0,
      "reward": 0.7142499685287476,
      "reward_std": 0.28147342801094055,
      "rewards/reward_func/mean": 0.7142499685287476,
      "rewards/reward_func/std": 0.4409100115299225,
      "step": 735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.22573751397430897,
      "epoch": 1.4375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.1315789473684213e-06,
      "loss": 0.0,
      "num_tokens": 3463960.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6214800626039505,
      "epoch": 1.439453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2706398665904999,
      "learning_rate": 1.1263157894736842e-06,
      "loss": -0.0,
      "num_tokens": 3468760.0,
      "reward": 0.6126375198364258,
      "reward_std": 0.5190345048904419,
      "rewards/reward_func/mean": 0.6126375198364258,
      "rewards/reward_func/std": 0.4971480667591095,
      "step": 737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.860218707472086,
      "epoch": 1.44140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24548137187957764,
      "learning_rate": 1.1210526315789474e-06,
      "loss": 0.0,
      "num_tokens": 3473256.0,
      "reward": 0.6269875168800354,
      "reward_std": 0.53028404712677,
      "rewards/reward_func/mean": 0.6269874572753906,
      "rewards/reward_func/std": 0.5090279579162598,
      "step": 738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 1.0728548988699913,
      "epoch": 1.443359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27242031693458557,
      "learning_rate": 1.1157894736842106e-06,
      "loss": -0.0,
      "num_tokens": 3477752.0,
      "reward": 0.754687488079071,
      "reward_std": 0.4906249940395355,
      "rewards/reward_func/mean": 0.754687488079071,
      "rewards/reward_func/std": 0.45434102416038513,
      "step": 739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5216426700353622,
      "epoch": 1.4453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21481481194496155,
      "learning_rate": 1.110526315789474e-06,
      "loss": -0.0,
      "num_tokens": 3482712.0,
      "reward": 0.11024999618530273,
      "reward_std": 0.2122509479522705,
      "rewards/reward_func/mean": 0.11024999618530273,
      "rewards/reward_func/std": 0.29184964299201965,
      "step": 740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5817220564931631,
      "epoch": 1.447265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2324642688035965,
      "learning_rate": 1.1052631578947369e-06,
      "loss": 0.0,
      "num_tokens": 3487360.0,
      "reward": 0.36861249804496765,
      "reward_std": 0.5240679979324341,
      "rewards/reward_func/mean": 0.36861249804496765,
      "rewards/reward_func/std": 0.5054256319999695,
      "step": 741
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6618504039943218,
      "epoch": 1.44921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22483286261558533,
      "learning_rate": 1.1e-06,
      "loss": -0.0,
      "num_tokens": 3492376.0,
      "reward": 0.3586999773979187,
      "reward_std": 0.513043999671936,
      "rewards/reward_func/mean": 0.3586999773979187,
      "rewards/reward_func/std": 0.4951876997947693,
      "step": 742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8393836617469788,
      "epoch": 1.451171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27230191230773926,
      "learning_rate": 1.0947368421052632e-06,
      "loss": 0.0,
      "num_tokens": 3497008.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.4834516644477844,
      "rewards/reward_func/mean": 0.7250000238418579,
      "rewards/reward_func/std": 0.44761592149734497,
      "step": 743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.675093300640583,
      "epoch": 1.453125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14707615971565247,
      "learning_rate": 1.0894736842105264e-06,
      "loss": 0.0,
      "num_tokens": 3501864.0,
      "reward": 0.8765624761581421,
      "reward_std": 0.24687500298023224,
      "rewards/reward_func/mean": 0.8765624761581421,
      "rewards/reward_func/std": 0.34913399815559387,
      "step": 744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7936474606394768,
      "epoch": 1.455078125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.19649529457092285,
      "learning_rate": 1.0842105263157895e-06,
      "loss": 0.0,
      "num_tokens": 3506456.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5444208905100822,
      "epoch": 1.45703125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1466275006532669,
      "learning_rate": 1.0789473684210527e-06,
      "loss": 0.0,
      "num_tokens": 3511136.0,
      "reward": 0.7593749761581421,
      "reward_std": 0.2780371904373169,
      "rewards/reward_func/mean": 0.7593749761581421,
      "rewards/reward_func/std": 0.44575127959251404,
      "step": 746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4803670160472393,
      "epoch": 1.458984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20859357714653015,
      "learning_rate": 1.0736842105263159e-06,
      "loss": 0.0,
      "num_tokens": 3515912.0,
      "reward": 0.23980000615119934,
      "reward_std": 0.4796000123023987,
      "rewards/reward_func/mean": 0.23980000615119934,
      "rewards/reward_func/std": 0.4441418945789337,
      "step": 747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5034095216542482,
      "epoch": 1.4609375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1568986475467682,
      "learning_rate": 1.068421052631579e-06,
      "loss": -0.0,
      "num_tokens": 3520712.0,
      "reward": 0.1288749873638153,
      "reward_std": 0.23274998366832733,
      "rewards/reward_func/mean": 0.1288749873638153,
      "rewards/reward_func/std": 0.33443784713745117,
      "step": 748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7676276192069054,
      "epoch": 1.462890625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1505260467529297,
      "learning_rate": 1.0631578947368422e-06,
      "loss": 0.0,
      "num_tokens": 3525728.0,
      "reward": 0.22550000250339508,
      "reward_std": 0.2604222893714905,
      "rewards/reward_func/mean": 0.22550000250339508,
      "rewards/reward_func/std": 0.41758477687835693,
      "step": 749
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7294511571526527,
      "epoch": 1.46484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2366306334733963,
      "learning_rate": 1.0578947368421054e-06,
      "loss": 0.0,
      "num_tokens": 3530360.0,
      "reward": 0.8474999666213989,
      "reward_std": 0.2550317049026489,
      "rewards/reward_func/mean": 0.8474999666213989,
      "rewards/reward_func/std": 0.34281647205352783,
      "step": 750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7323691435158253,
      "epoch": 1.466796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28612589836120605,
      "learning_rate": 1.0526315789473685e-06,
      "loss": 0.0,
      "num_tokens": 3535048.0,
      "reward": 0.7447500228881836,
      "reward_std": 0.4966987073421478,
      "rewards/reward_func/mean": 0.7447500228881836,
      "rewards/reward_func/std": 0.4598980247974396,
      "step": 751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.659924540668726,
      "epoch": 1.46875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25231942534446716,
      "learning_rate": 1.0473684210526317e-06,
      "loss": -0.0,
      "num_tokens": 3539400.0,
      "reward": 0.8592499494552612,
      "reward_std": 0.25882306694984436,
      "rewards/reward_func/mean": 0.8592499494552612,
      "rewards/reward_func/std": 0.3478110134601593,
      "step": 752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6127423346042633,
      "epoch": 1.470703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19419050216674805,
      "learning_rate": 1.0421052631578949e-06,
      "loss": -0.0,
      "num_tokens": 3544256.0,
      "reward": 0.3825249969959259,
      "reward_std": 0.5170859694480896,
      "rewards/reward_func/mean": 0.3825249969959259,
      "rewards/reward_func/std": 0.49410948157310486,
      "step": 753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5326105952262878,
      "epoch": 1.47265625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1416216343641281,
      "learning_rate": 1.036842105263158e-06,
      "loss": -0.0,
      "num_tokens": 3549040.0,
      "reward": 0.12262499332427979,
      "reward_std": 0.23698993027210236,
      "rewards/reward_func/mean": 0.12262499332427979,
      "rewards/reward_func/std": 0.3368479907512665,
      "step": 754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.65166400000453,
      "epoch": 1.474609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26254624128341675,
      "learning_rate": 1.0315789473684212e-06,
      "loss": 0.0,
      "num_tokens": 3554032.0,
      "reward": 0.23919999599456787,
      "reward_std": 0.47839999198913574,
      "rewards/reward_func/mean": 0.23919999599456787,
      "rewards/reward_func/std": 0.44306278228759766,
      "step": 755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6490476988255978,
      "epoch": 1.4765625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15658609569072723,
      "learning_rate": 1.0263157894736843e-06,
      "loss": 0.0,
      "num_tokens": 3558904.0,
      "reward": 0.878125011920929,
      "reward_std": 0.24375002086162567,
      "rewards/reward_func/mean": 0.878125011920929,
      "rewards/reward_func/std": 0.34471458196640015,
      "step": 756
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4935445673763752,
      "epoch": 1.478515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21735656261444092,
      "learning_rate": 1.0210526315789475e-06,
      "loss": 0.0,
      "num_tokens": 3563760.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6523591168224812,
      "epoch": 1.48046875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.26018184423446655,
      "learning_rate": 1.0157894736842105e-06,
      "loss": -0.0,
      "num_tokens": 3568288.0,
      "reward": 0.8710500001907349,
      "reward_std": 0.2310333549976349,
      "rewards/reward_func/mean": 0.8710500001907349,
      "rewards/reward_func/std": 0.3324243724346161,
      "step": 758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8797458559274673,
      "epoch": 1.482421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2709335386753082,
      "learning_rate": 1.0105263157894738e-06,
      "loss": 0.0,
      "num_tokens": 3572824.0,
      "reward": 0.49787500500679016,
      "reward_std": 0.567833423614502,
      "rewards/reward_func/mean": 0.49787500500679016,
      "rewards/reward_func/std": 0.5257702469825745,
      "step": 759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49080283008515835,
      "epoch": 1.484375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16781467199325562,
      "learning_rate": 1.005263157894737e-06,
      "loss": -0.0,
      "num_tokens": 3577312.0,
      "reward": 0.9384000301361084,
      "reward_std": 0.01759999990463257,
      "rewards/reward_func/mean": 0.9384000301361084,
      "rewards/reward_func/std": 0.02489016018807888,
      "step": 760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6200328879058361,
      "epoch": 1.486328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21412330865859985,
      "learning_rate": 1.0000000000000002e-06,
      "loss": -0.0,
      "num_tokens": 3581960.0,
      "reward": 0.49184998869895935,
      "reward_std": 0.5679755210876465,
      "rewards/reward_func/mean": 0.49184998869895935,
      "rewards/reward_func/std": 0.5259869694709778,
      "step": 761
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.32999411411583424,
      "epoch": 1.48828125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 9.947368421052631e-07,
      "loss": 0.0,
      "num_tokens": 3586456.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5745944939553738,
      "epoch": 1.490234375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14827704429626465,
      "learning_rate": 9.894736842105265e-07,
      "loss": -0.0,
      "num_tokens": 3591216.0,
      "reward": 0.3511999845504761,
      "reward_std": 0.23414616286754608,
      "rewards/reward_func/mean": 0.3511999845504761,
      "rewards/reward_func/std": 0.484712690114975,
      "step": 763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7484982162714005,
      "epoch": 1.4921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2768227159976959,
      "learning_rate": 9.842105263157897e-07,
      "loss": 0.0,
      "num_tokens": 3595704.0,
      "reward": 0.8731499910354614,
      "reward_std": 0.2419903725385666,
      "rewards/reward_func/mean": 0.8731499910354614,
      "rewards/reward_func/std": 0.3327745497226715,
      "step": 764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.558552972972393,
      "epoch": 1.494140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24600715935230255,
      "learning_rate": 9.789473684210526e-07,
      "loss": -0.0,
      "num_tokens": 3600560.0,
      "reward": 0.353300005197525,
      "reward_std": 0.5092028975486755,
      "rewards/reward_func/mean": 0.353300005197525,
      "rewards/reward_func/std": 0.4877893924713135,
      "step": 765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.627804696559906,
      "epoch": 1.49609375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.34988823533058167,
      "learning_rate": 9.736842105263158e-07,
      "loss": 0.0,
      "num_tokens": 3605400.0,
      "reward": 0.11959999799728394,
      "reward_std": 0.23919998109340668,
      "rewards/reward_func/mean": 0.11959999799728394,
      "rewards/reward_func/std": 0.3382799029350281,
      "step": 766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.40967404283583164,
      "epoch": 1.498046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20395620167255402,
      "learning_rate": 9.68421052631579e-07,
      "loss": -0.0,
      "num_tokens": 3610384.0,
      "reward": 0.33976250886917114,
      "reward_std": 0.2288047820329666,
      "rewards/reward_func/mean": 0.33976250886917114,
      "rewards/reward_func/std": 0.4656626880168915,
      "step": 767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.758793193846941,
      "epoch": 1.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28868770599365234,
      "learning_rate": 9.631578947368423e-07,
      "loss": 0.0,
      "num_tokens": 3615248.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8679467141628265,
      "epoch": 1.501953125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.19479088485240936,
      "learning_rate": 9.578947368421053e-07,
      "loss": 0.0,
      "num_tokens": 3619800.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 769
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.665732316672802,
      "epoch": 1.50390625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17135152220726013,
      "learning_rate": 9.526315789473685e-07,
      "loss": 0.0,
      "num_tokens": 3624288.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.654691674746573,
      "epoch": 1.505859375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1414864957332611,
      "learning_rate": 9.473684210526317e-07,
      "loss": 0.0,
      "num_tokens": 3629144.0,
      "reward": 0.6349375247955322,
      "reward_std": 0.22897498309612274,
      "rewards/reward_func/mean": 0.6349375247955322,
      "rewards/reward_func/std": 0.4740232825279236,
      "step": 771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5642881374806166,
      "epoch": 1.5078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2399088591337204,
      "learning_rate": 9.421052631578948e-07,
      "loss": -0.0,
      "num_tokens": 3633928.0,
      "reward": 0.3761124610900879,
      "reward_std": 0.49379342794418335,
      "rewards/reward_func/mean": 0.3761124908924103,
      "rewards/reward_func/std": 0.47505778074264526,
      "step": 772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8916216120123863,
      "epoch": 1.509765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25461268424987793,
      "learning_rate": 9.368421052631579e-07,
      "loss": -0.0,
      "num_tokens": 3638416.0,
      "reward": 0.7593749761581421,
      "reward_std": 0.48124998807907104,
      "rewards/reward_func/mean": 0.7593749761581421,
      "rewards/reward_func/std": 0.44555091857910156,
      "step": 773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3287378936074674,
      "epoch": 1.51171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16437959671020508,
      "learning_rate": 9.315789473684212e-07,
      "loss": 0.0,
      "num_tokens": 3642912.0,
      "reward": 0.6930000185966492,
      "reward_std": 0.2701496481895447,
      "rewards/reward_func/mean": 0.6930000185966492,
      "rewards/reward_func/std": 0.4277917444705963,
      "step": 774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5501508843153715,
      "epoch": 1.513671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19080518186092377,
      "learning_rate": 9.263157894736844e-07,
      "loss": -0.0,
      "num_tokens": 3647880.0,
      "reward": 0.3418999910354614,
      "reward_std": 0.21971121430397034,
      "rewards/reward_func/mean": 0.3418999910354614,
      "rewards/reward_func/std": 0.4448350965976715,
      "step": 775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6878321021795273,
      "epoch": 1.515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24576818943023682,
      "learning_rate": 9.210526315789474e-07,
      "loss": -0.0,
      "num_tokens": 3652888.0,
      "reward": 0.35216251015663147,
      "reward_std": 0.5052483081817627,
      "rewards/reward_func/mean": 0.35216251015663147,
      "rewards/reward_func/std": 0.4830230474472046,
      "step": 776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8899596408009529,
      "epoch": 1.517578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2556166350841522,
      "learning_rate": 9.157894736842106e-07,
      "loss": 0.0,
      "num_tokens": 3657672.0,
      "reward": 0.7323875427246094,
      "reward_std": 0.2783551812171936,
      "rewards/reward_func/mean": 0.7323875427246094,
      "rewards/reward_func/std": 0.42598211765289307,
      "step": 777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6734718568623066,
      "epoch": 1.51953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22433032095432281,
      "learning_rate": 9.105263157894737e-07,
      "loss": 0.0,
      "num_tokens": 3662448.0,
      "reward": 0.4796375036239624,
      "reward_std": 0.5285993814468384,
      "rewards/reward_func/mean": 0.4796375036239624,
      "rewards/reward_func/std": 0.489534467458725,
      "step": 778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.47480286099016666,
      "epoch": 1.521484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20607703924179077,
      "learning_rate": 9.05263157894737e-07,
      "loss": -0.0,
      "num_tokens": 3667216.0,
      "reward": 0.3725624680519104,
      "reward_std": 0.5063344240188599,
      "rewards/reward_func/mean": 0.3725624680519104,
      "rewards/reward_func/std": 0.48398149013519287,
      "step": 779
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5787655636668205,
      "epoch": 1.5234375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.19025181233882904,
      "learning_rate": 9.000000000000001e-07,
      "loss": 0.0,
      "num_tokens": 3672056.0,
      "reward": 0.11410000175237656,
      "reward_std": 0.2282000035047531,
      "rewards/reward_func/mean": 0.11410000175237656,
      "rewards/reward_func/std": 0.3227235674858093,
      "step": 780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 1.0042685344815254,
      "epoch": 1.525390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2973874509334564,
      "learning_rate": 8.947368421052632e-07,
      "loss": 0.0,
      "num_tokens": 3676552.0,
      "reward": 0.874287486076355,
      "reward_std": 0.24427926540374756,
      "rewards/reward_func/mean": 0.874287486076355,
      "rewards/reward_func/std": 0.3382539749145508,
      "step": 781
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7004695013165474,
      "epoch": 1.52734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24851390719413757,
      "learning_rate": 8.894736842105264e-07,
      "loss": -0.0,
      "num_tokens": 3680912.0,
      "reward": 0.8754374980926514,
      "reward_std": 0.23534303903579712,
      "rewards/reward_func/mean": 0.8754374980926514,
      "rewards/reward_func/std": 0.3189397156238556,
      "step": 782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6289892308413982,
      "epoch": 1.529296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3434109687805176,
      "learning_rate": 8.842105263157895e-07,
      "loss": 0.0,
      "num_tokens": 3685568.0,
      "reward": 0.4922749996185303,
      "reward_std": 0.4815645217895508,
      "rewards/reward_func/mean": 0.4922749996185303,
      "rewards/reward_func/std": 0.519862949848175,
      "step": 783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49262196384370327,
      "epoch": 1.53125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2120276689529419,
      "learning_rate": 8.789473684210527e-07,
      "loss": 0.0,
      "num_tokens": 3690344.0,
      "reward": 0.24681249260902405,
      "reward_std": 0.47292011976242065,
      "rewards/reward_func/mean": 0.24681249260902405,
      "rewards/reward_func/std": 0.4378432333469391,
      "step": 784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6302190236747265,
      "epoch": 1.533203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22514617443084717,
      "learning_rate": 8.736842105263159e-07,
      "loss": -0.0,
      "num_tokens": 3695136.0,
      "reward": 0.24524998664855957,
      "reward_std": 0.4741288423538208,
      "rewards/reward_func/mean": 0.24524998664855957,
      "rewards/reward_func/std": 0.43902352452278137,
      "step": 785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5339977368712425,
      "epoch": 1.53515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2869108319282532,
      "learning_rate": 8.68421052631579e-07,
      "loss": -0.0,
      "num_tokens": 3700064.0,
      "reward": 0.7380625009536743,
      "reward_std": 0.2860471308231354,
      "rewards/reward_func/mean": 0.7380625009536743,
      "rewards/reward_func/std": 0.45175832509994507,
      "step": 786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7544026896357536,
      "epoch": 1.537109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26479172706604004,
      "learning_rate": 8.631578947368421e-07,
      "loss": -0.0,
      "num_tokens": 3704800.0,
      "reward": 0.5,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5345224738121033,
      "step": 787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7368651237338781,
      "epoch": 1.5390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24209897220134735,
      "learning_rate": 8.578947368421054e-07,
      "loss": 0.0,
      "num_tokens": 3710008.0,
      "reward": 0.6328125,
      "reward_std": 0.529944896697998,
      "rewards/reward_func/mean": 0.6328125,
      "rewards/reward_func/std": 0.5071338415145874,
      "step": 788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7057337984442711,
      "epoch": 1.541015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20642440021038055,
      "learning_rate": 8.526315789473685e-07,
      "loss": -0.0,
      "num_tokens": 3714688.0,
      "reward": 0.8713124990463257,
      "reward_std": 0.25737500190734863,
      "rewards/reward_func/mean": 0.8713124990463257,
      "rewards/reward_func/std": 0.3473237454891205,
      "step": 789
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6739199366420507,
      "epoch": 1.54296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2283017486333847,
      "learning_rate": 8.473684210526316e-07,
      "loss": 0.0,
      "num_tokens": 3719672.0,
      "reward": 0.3151249885559082,
      "reward_std": 0.4546101689338684,
      "rewards/reward_func/mean": 0.3151249885559082,
      "rewards/reward_func/std": 0.43122467398643494,
      "step": 790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.37616512551903725,
      "epoch": 1.544921875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13778182864189148,
      "learning_rate": 8.421052631578948e-07,
      "loss": -0.0,
      "num_tokens": 3724448.0,
      "reward": 0.49831247329711914,
      "reward_std": 0.023593232035636902,
      "rewards/reward_func/mean": 0.49831250309944153,
      "rewards/reward_func/std": 0.4902626574039459,
      "step": 791
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5549918822944164,
      "epoch": 1.546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22168178856372833,
      "learning_rate": 8.36842105263158e-07,
      "loss": 0.0,
      "num_tokens": 3729432.0,
      "reward": 0.12794999778270721,
      "reward_std": 0.22315937280654907,
      "rewards/reward_func/mean": 0.12794999778270721,
      "rewards/reward_func/std": 0.3018590807914734,
      "step": 792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.870604507625103,
      "epoch": 1.548828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22514410316944122,
      "learning_rate": 8.315789473684212e-07,
      "loss": 0.0,
      "num_tokens": 3733952.0,
      "reward": 0.9918999671936035,
      "reward_std": 0.011635387316346169,
      "rewards/reward_func/mean": 0.9918999671936035,
      "rewards/reward_func/std": 0.011179066263139248,
      "step": 793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.48082075640559196,
      "epoch": 1.55078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22919267416000366,
      "learning_rate": 8.263157894736843e-07,
      "loss": 0.0,
      "num_tokens": 3738736.0,
      "reward": 0.3589249849319458,
      "reward_std": 0.5067580938339233,
      "rewards/reward_func/mean": 0.3589249849319458,
      "rewards/reward_func/std": 0.4885357916355133,
      "step": 794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5452318023890257,
      "epoch": 1.552734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19861352443695068,
      "learning_rate": 8.210526315789474e-07,
      "loss": -0.0,
      "num_tokens": 3743696.0,
      "reward": 0.12037499994039536,
      "reward_std": 0.22721248865127563,
      "rewards/reward_func/mean": 0.12037499248981476,
      "rewards/reward_func/std": 0.31041398644447327,
      "step": 795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6286281645298004,
      "epoch": 1.5546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20838730037212372,
      "learning_rate": 8.157894736842106e-07,
      "loss": 0.0,
      "num_tokens": 3748664.0,
      "reward": 0.32600000500679016,
      "reward_std": 0.46434351801872253,
      "rewards/reward_func/mean": 0.32600000500679016,
      "rewards/reward_func/std": 0.45025455951690674,
      "step": 796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6037598215043545,
      "epoch": 1.556640625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18488991260528564,
      "learning_rate": 8.105263157894736e-07,
      "loss": -0.0,
      "num_tokens": 3753024.0,
      "reward": 0.754687488079071,
      "reward_std": 0.2833658754825592,
      "rewards/reward_func/mean": 0.754687488079071,
      "rewards/reward_func/std": 0.45434102416038513,
      "step": 797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7057324796915054,
      "epoch": 1.55859375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18174698948860168,
      "learning_rate": 8.052631578947369e-07,
      "loss": 0.0,
      "num_tokens": 3757744.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5243270434439182,
      "epoch": 1.560546875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1930585652589798,
      "learning_rate": 8.000000000000001e-07,
      "loss": 0.0,
      "num_tokens": 3762088.0,
      "reward": 0.8765624761581421,
      "reward_std": 0.24687500298023224,
      "rewards/reward_func/mean": 0.8765624761581421,
      "rewards/reward_func/std": 0.34913399815559387,
      "step": 799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7073042988777161,
      "epoch": 1.5625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15252089500427246,
      "learning_rate": 7.947368421052632e-07,
      "loss": 0.0,
      "num_tokens": 3766704.0,
      "reward": 0.8503124713897705,
      "reward_std": 0.23873114585876465,
      "rewards/reward_func/mean": 0.8503124713897705,
      "rewards/reward_func/std": 0.33856281638145447,
      "step": 800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4393083956092596,
      "epoch": 1.564453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21687999367713928,
      "learning_rate": 7.894736842105263e-07,
      "loss": 0.0,
      "num_tokens": 3771488.0,
      "reward": 0.4840624928474426,
      "reward_std": 0.5265330076217651,
      "rewards/reward_func/mean": 0.4840624928474426,
      "rewards/reward_func/std": 0.48748067021369934,
      "step": 801
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7102243229746819,
      "epoch": 1.56640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2167195975780487,
      "learning_rate": 7.842105263157896e-07,
      "loss": 0.0,
      "num_tokens": 3776256.0,
      "reward": 0.7418375015258789,
      "reward_std": 0.4656052589416504,
      "rewards/reward_func/mean": 0.7418375015258789,
      "rewards/reward_func/std": 0.4318162798881531,
      "step": 802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.520836379379034,
      "epoch": 1.568359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23460784554481506,
      "learning_rate": 7.789473684210527e-07,
      "loss": -0.0,
      "num_tokens": 3781024.0,
      "reward": 0.3558874726295471,
      "reward_std": 0.5002304315567017,
      "rewards/reward_func/mean": 0.3558875024318695,
      "rewards/reward_func/std": 0.48085901141166687,
      "step": 803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5246933065354824,
      "epoch": 1.5703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20828157663345337,
      "learning_rate": 7.736842105263158e-07,
      "loss": 0.0,
      "num_tokens": 3785672.0,
      "reward": 0.49729999899864197,
      "reward_std": 0.4982522130012512,
      "rewards/reward_func/mean": 0.49729999899864197,
      "rewards/reward_func/std": 0.531683087348938,
      "step": 804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4413669481873512,
      "epoch": 1.572265625,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.684210526315789e-07,
      "loss": 0.0,
      "num_tokens": 3790176.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5796146206557751,
      "epoch": 1.57421875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.631578947368422e-07,
      "loss": 0.0,
      "num_tokens": 3794976.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.531171353533864,
      "epoch": 1.576171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22131916880607605,
      "learning_rate": 7.578947368421054e-07,
      "loss": -0.0,
      "num_tokens": 3799768.0,
      "reward": 0.37181249260902405,
      "reward_std": 0.5167224407196045,
      "rewards/reward_func/mean": 0.37181249260902405,
      "rewards/reward_func/std": 0.49653398990631104,
      "step": 807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6098550744354725,
      "epoch": 1.578125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1398448944091797,
      "learning_rate": 7.526315789473684e-07,
      "loss": 0.0,
      "num_tokens": 3804392.0,
      "reward": 0.7366249561309814,
      "reward_std": 0.2787158489227295,
      "rewards/reward_func/mean": 0.7366249561309814,
      "rewards/reward_func/std": 0.4469396471977234,
      "step": 808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.715061454102397,
      "epoch": 1.580078125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.23018530011177063,
      "learning_rate": 7.473684210526316e-07,
      "loss": 0.0,
      "num_tokens": 3809256.0,
      "reward": 0.8765624761581421,
      "reward_std": 0.24687500298023224,
      "rewards/reward_func/mean": 0.8765624761581421,
      "rewards/reward_func/std": 0.34913399815559387,
      "step": 809
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6622244138270617,
      "epoch": 1.58203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19872058928012848,
      "learning_rate": 7.421052631578949e-07,
      "loss": -0.0,
      "num_tokens": 3814208.0,
      "reward": 0.015625,
      "reward_std": 0.02392766997218132,
      "rewards/reward_func/mean": 0.015625,
      "rewards/reward_func/std": 0.0265165064483881,
      "step": 810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.41381765250116587,
      "epoch": 1.583984375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13804210722446442,
      "learning_rate": 7.368421052631579e-07,
      "loss": 0.0,
      "num_tokens": 3818696.0,
      "reward": 0.6972000002861023,
      "reward_std": 0.2683524191379547,
      "rewards/reward_func/mean": 0.6972000002861023,
      "rewards/reward_func/std": 0.43032118678092957,
      "step": 811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7343226224184036,
      "epoch": 1.5859375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.315789473684211e-07,
      "loss": 0.0,
      "num_tokens": 3823568.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5337802059948444,
      "epoch": 1.587890625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1350008249282837,
      "learning_rate": 7.263157894736843e-07,
      "loss": -0.0,
      "num_tokens": 3828216.0,
      "reward": 0.9947500228881836,
      "reward_std": 0.010500003583729267,
      "rewards/reward_func/mean": 0.9947500228881836,
      "rewards/reward_func/std": 0.014849240891635418,
      "step": 813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6469601839780807,
      "epoch": 1.58984375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13170914351940155,
      "learning_rate": 7.210526315789475e-07,
      "loss": 0.0,
      "num_tokens": 3833048.0,
      "reward": 0.11959999799728394,
      "reward_std": 0.23919998109340668,
      "rewards/reward_func/mean": 0.11959999799728394,
      "rewards/reward_func/std": 0.3382799029350281,
      "step": 814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6435871571302414,
      "epoch": 1.591796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21571120619773865,
      "learning_rate": 7.157894736842106e-07,
      "loss": -0.0,
      "num_tokens": 3837800.0,
      "reward": 0.7281500101089478,
      "reward_std": 0.4856353998184204,
      "rewards/reward_func/mean": 0.7281500101089478,
      "rewards/reward_func/std": 0.44976165890693665,
      "step": 815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7475415430963039,
      "epoch": 1.59375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22343456745147705,
      "learning_rate": 7.105263157894737e-07,
      "loss": -0.0,
      "num_tokens": 3842544.0,
      "reward": 0.7112499475479126,
      "reward_std": 0.45749998092651367,
      "rewards/reward_func/mean": 0.7112500071525574,
      "rewards/reward_func/std": 0.4237734377384186,
      "step": 816
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7341870330274105,
      "epoch": 1.595703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23006655275821686,
      "learning_rate": 7.052631578947369e-07,
      "loss": 0.0,
      "num_tokens": 3846872.0,
      "reward": 0.6531250476837158,
      "reward_std": 0.49706268310546875,
      "rewards/reward_func/mean": 0.653124988079071,
      "rewards/reward_func/std": 0.4788728654384613,
      "step": 817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9006102904677391,
      "epoch": 1.59765625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1597568690776825,
      "learning_rate": 7.000000000000001e-07,
      "loss": 0.0,
      "num_tokens": 3851368.0,
      "reward": 0.9973000288009644,
      "reward_std": 0.0054000020027160645,
      "rewards/reward_func/mean": 0.9973000288009644,
      "rewards/reward_func/std": 0.007636756170541048,
      "step": 818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6276929546147585,
      "epoch": 1.599609375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.947368421052631e-07,
      "loss": 0.0,
      "num_tokens": 3855840.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 819
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7189521789550781,
      "epoch": 1.6015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25209999084472656,
      "learning_rate": 6.894736842105264e-07,
      "loss": 0.0,
      "num_tokens": 3860840.0,
      "reward": 0.5924999713897705,
      "reward_std": 0.5084478855133057,
      "rewards/reward_func/mean": 0.5924999713897705,
      "rewards/reward_func/std": 0.49154552817344666,
      "step": 820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.49646480195224285,
      "epoch": 1.603515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19341827929019928,
      "learning_rate": 6.842105263157896e-07,
      "loss": -0.0,
      "num_tokens": 3865624.0,
      "reward": 0.36006247997283936,
      "reward_std": 0.513949990272522,
      "rewards/reward_func/mean": 0.36006247997283936,
      "rewards/reward_func/std": 0.493501216173172,
      "step": 821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5097112730145454,
      "epoch": 1.60546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20900575816631317,
      "learning_rate": 6.789473684210526e-07,
      "loss": 0.0,
      "num_tokens": 3870608.0,
      "reward": 0.2334499955177307,
      "reward_std": 0.2624872624874115,
      "rewards/reward_func/mean": 0.2334500104188919,
      "rewards/reward_func/std": 0.4170190095901489,
      "step": 822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4433702193200588,
      "epoch": 1.607421875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.736842105263158e-07,
      "loss": 0.0,
      "num_tokens": 3875120.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7894489616155624,
      "epoch": 1.609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21178889274597168,
      "learning_rate": 6.684210526315791e-07,
      "loss": 0.0,
      "num_tokens": 3880120.0,
      "reward": 0.34780001640319824,
      "reward_std": 0.5050222873687744,
      "rewards/reward_func/mean": 0.34780001640319824,
      "rewards/reward_func/std": 0.4806229770183563,
      "step": 824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.39088401943445206,
      "epoch": 1.611328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24352553486824036,
      "learning_rate": 6.631578947368422e-07,
      "loss": 0.0,
      "num_tokens": 3885080.0,
      "reward": 0.3395000100135803,
      "reward_std": 0.49134349822998047,
      "rewards/reward_func/mean": 0.3395000100135803,
      "rewards/reward_func/std": 0.4688292443752289,
      "step": 825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 1.138112135231495,
      "epoch": 1.61328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2556565999984741,
      "learning_rate": 6.578947368421053e-07,
      "loss": 0.0,
      "num_tokens": 3889560.0,
      "reward": 0.5066750049591064,
      "reward_std": 0.5636498928070068,
      "rewards/reward_func/mean": 0.5066750049591064,
      "rewards/reward_func/std": 0.5218542218208313,
      "step": 826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7840673625469208,
      "epoch": 1.615234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21557015180587769,
      "learning_rate": 6.526315789473684e-07,
      "loss": 0.0,
      "num_tokens": 3894040.0,
      "reward": 0.7535499930381775,
      "reward_std": 0.4857522249221802,
      "rewards/reward_func/mean": 0.7535499930381775,
      "rewards/reward_func/std": 0.4499310553073883,
      "step": 827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5850768871605396,
      "epoch": 1.6171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24189525842666626,
      "learning_rate": 6.473684210526317e-07,
      "loss": -0.0,
      "num_tokens": 3898688.0,
      "reward": 0.6219249963760376,
      "reward_std": 0.526694655418396,
      "rewards/reward_func/mean": 0.6219249963760376,
      "rewards/reward_func/std": 0.508267343044281,
      "step": 828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6123648304492235,
      "epoch": 1.619140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2563243508338928,
      "learning_rate": 6.421052631578948e-07,
      "loss": 0.0,
      "num_tokens": 3903656.0,
      "reward": 0.125062495470047,
      "reward_std": 0.22389693558216095,
      "rewards/reward_func/mean": 0.1250625103712082,
      "rewards/reward_func/std": 0.3084697723388672,
      "step": 829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8369600027799606,
      "epoch": 1.62109375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17828819155693054,
      "learning_rate": 6.368421052631579e-07,
      "loss": 0.0,
      "num_tokens": 3908192.0,
      "reward": 0.7562500238418579,
      "reward_std": 0.2814582586288452,
      "rewards/reward_func/mean": 0.7562500238418579,
      "rewards/reward_func/std": 0.4513373076915741,
      "step": 830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.638560563325882,
      "epoch": 1.623046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19805623590946198,
      "learning_rate": 6.315789473684211e-07,
      "loss": 0.0,
      "num_tokens": 3912848.0,
      "reward": 0.5066750049591064,
      "reward_std": 0.5637478828430176,
      "rewards/reward_func/mean": 0.5066750049591064,
      "rewards/reward_func/std": 0.5220252871513367,
      "step": 831
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5707159228622913,
      "epoch": 1.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18247583508491516,
      "learning_rate": 6.263157894736844e-07,
      "loss": -0.0,
      "num_tokens": 3917504.0,
      "reward": 0.3765625059604645,
      "reward_std": 0.5376508831977844,
      "rewards/reward_func/mean": 0.3765625059604645,
      "rewards/reward_func/std": 0.5162726044654846,
      "step": 832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7538819462060928,
      "epoch": 1.626953125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17551633715629578,
      "learning_rate": 6.210526315789474e-07,
      "loss": 0.0,
      "num_tokens": 3922512.0,
      "reward": 0.12229999899864197,
      "reward_std": 0.24459999799728394,
      "rewards/reward_func/mean": 0.12229999899864197,
      "rewards/reward_func/std": 0.34591662883758545,
      "step": 833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4829423278570175,
      "epoch": 1.62890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20213103294372559,
      "learning_rate": 6.157894736842106e-07,
      "loss": 0.0,
      "num_tokens": 3927296.0,
      "reward": 0.3590875267982483,
      "reward_std": 0.2541199028491974,
      "rewards/reward_func/mean": 0.3590874969959259,
      "rewards/reward_func/std": 0.4724847376346588,
      "step": 834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5105188228189945,
      "epoch": 1.630859375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1484820693731308,
      "learning_rate": 6.105263157894738e-07,
      "loss": -0.0,
      "num_tokens": 3932224.0,
      "reward": 0.7462999820709229,
      "reward_std": 0.26800599694252014,
      "rewards/reward_func/mean": 0.7462999820709229,
      "rewards/reward_func/std": 0.4297657012939453,
      "step": 835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6433870941400528,
      "epoch": 1.6328125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24502220749855042,
      "learning_rate": 6.052631578947369e-07,
      "loss": 0.0,
      "num_tokens": 3936848.0,
      "reward": 0.5093749761581421,
      "reward_std": 0.5669463872909546,
      "rewards/reward_func/mean": 0.5093749761581421,
      "rewards/reward_func/std": 0.5250744223594666,
      "step": 836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4820468481630087,
      "epoch": 1.634765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1734929382801056,
      "learning_rate": 6.000000000000001e-07,
      "loss": 0.0,
      "num_tokens": 3941352.0,
      "reward": 0.8113000392913818,
      "reward_std": 0.23659999668598175,
      "rewards/reward_func/mean": 0.8113000392913818,
      "rewards/reward_func/std": 0.32786741852760315,
      "step": 837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5926277190446854,
      "epoch": 1.63671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2382938116788864,
      "learning_rate": 5.947368421052632e-07,
      "loss": -0.0,
      "num_tokens": 3946144.0,
      "reward": 0.7357000112533569,
      "reward_std": 0.2871914803981781,
      "rewards/reward_func/mean": 0.7357000112533569,
      "rewards/reward_func/std": 0.45415768027305603,
      "step": 838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7808935679495335,
      "epoch": 1.638671875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1889333575963974,
      "learning_rate": 5.894736842105263e-07,
      "loss": 0.0,
      "num_tokens": 3950680.0,
      "reward": 0.7504249811172485,
      "reward_std": 0.28198346495628357,
      "rewards/reward_func/mean": 0.7504249811172485,
      "rewards/reward_func/std": 0.4555181860923767,
      "step": 839
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8180569782853127,
      "epoch": 1.640625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17319242656230927,
      "learning_rate": 5.842105263157896e-07,
      "loss": 0.0,
      "num_tokens": 3955160.0,
      "reward": 0.7562500238418579,
      "reward_std": 0.2814582586288452,
      "rewards/reward_func/mean": 0.7562500238418579,
      "rewards/reward_func/std": 0.4513373076915741,
      "step": 840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4921247735619545,
      "epoch": 1.642578125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5.789473684210526e-07,
      "loss": 0.0,
      "num_tokens": 3959648.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 841
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6009175051003695,
      "epoch": 1.64453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21017959713935852,
      "learning_rate": 5.736842105263158e-07,
      "loss": -0.0,
      "num_tokens": 3964288.0,
      "reward": 0.49517500400543213,
      "reward_std": 0.4928368628025055,
      "rewards/reward_func/mean": 0.49517500400543213,
      "rewards/reward_func/std": 0.5229134559631348,
      "step": 842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6235355269163847,
      "epoch": 1.646484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2219318151473999,
      "learning_rate": 5.68421052631579e-07,
      "loss": 0.0,
      "num_tokens": 3969248.0,
      "reward": 0.3330000042915344,
      "reward_std": 0.47834351658821106,
      "rewards/reward_func/mean": 0.3330000042915344,
      "rewards/reward_func/std": 0.45958366990089417,
      "step": 843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.44258963875472546,
      "epoch": 1.6484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21175144612789154,
      "learning_rate": 5.631578947368421e-07,
      "loss": -0.0,
      "num_tokens": 3974224.0,
      "reward": 0.12037499994039536,
      "reward_std": 0.2241630107164383,
      "rewards/reward_func/mean": 0.12037499994039536,
      "rewards/reward_func/std": 0.31041398644447327,
      "step": 844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.691203698515892,
      "epoch": 1.650390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2612920105457306,
      "learning_rate": 5.578947368421053e-07,
      "loss": 0.0,
      "num_tokens": 3978880.0,
      "reward": 0.6223000288009644,
      "reward_std": 0.536927342414856,
      "rewards/reward_func/mean": 0.6223000288009644,
      "rewards/reward_func/std": 0.5153651237487793,
      "step": 845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5455706492066383,
      "epoch": 1.65234375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21940505504608154,
      "learning_rate": 5.526315789473684e-07,
      "loss": 0.0,
      "num_tokens": 3983728.0,
      "reward": 0.49701249599456787,
      "reward_std": 0.5561558604240417,
      "rewards/reward_func/mean": 0.49701249599456787,
      "rewards/reward_func/std": 0.5150313377380371,
      "step": 846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5260077305138111,
      "epoch": 1.654296875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5.473684210526316e-07,
      "loss": 0.0,
      "num_tokens": 3988360.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8520932123064995,
      "epoch": 1.65625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2201906442642212,
      "learning_rate": 5.421052631578948e-07,
      "loss": 0.0,
      "num_tokens": 3993352.0,
      "reward": 0.2501250207424164,
      "reward_std": 0.2676456868648529,
      "rewards/reward_func/mean": 0.2501250207424164,
      "rewards/reward_func/std": 0.39523282647132874,
      "step": 848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5580763444304466,
      "epoch": 1.658203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20035962760448456,
      "learning_rate": 5.368421052631579e-07,
      "loss": -0.0,
      "num_tokens": 3998008.0,
      "reward": 0.3670499920845032,
      "reward_std": 0.5273429155349731,
      "rewards/reward_func/mean": 0.3670499920845032,
      "rewards/reward_func/std": 0.5067015290260315,
      "step": 849
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.46416748873889446,
      "epoch": 1.66015625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15010768175125122,
      "learning_rate": 5.315789473684211e-07,
      "loss": -0.0,
      "num_tokens": 4002520.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6008065789937973,
      "epoch": 1.662109375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18306009471416473,
      "learning_rate": 5.263157894736843e-07,
      "loss": -0.0,
      "num_tokens": 4007264.0,
      "reward": 0.8224999904632568,
      "reward_std": 0.23499999940395355,
      "rewards/reward_func/mean": 0.8224999904632568,
      "rewards/reward_func/std": 0.33234018087387085,
      "step": 851
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.45169725827872753,
      "epoch": 1.6640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20183001458644867,
      "learning_rate": 5.210526315789474e-07,
      "loss": 0.0,
      "num_tokens": 4012224.0,
      "reward": 0.452875018119812,
      "reward_std": 0.5022744536399841,
      "rewards/reward_func/mean": 0.452875018119812,
      "rewards/reward_func/std": 0.46521246433258057,
      "step": 852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5721710417419672,
      "epoch": 1.666015625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21509496867656708,
      "learning_rate": 5.157894736842106e-07,
      "loss": -0.0,
      "num_tokens": 4017208.0,
      "reward": 0.3440000116825104,
      "reward_std": 0.4370303750038147,
      "rewards/reward_func/mean": 0.3440000116825104,
      "rewards/reward_func/std": 0.42025572061538696,
      "step": 853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7655346915125847,
      "epoch": 1.66796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27250996232032776,
      "learning_rate": 5.105263157894738e-07,
      "loss": -0.0,
      "num_tokens": 4021728.0,
      "reward": 0.6143499612808228,
      "reward_std": 0.25468018651008606,
      "rewards/reward_func/mean": 0.6143499612808228,
      "rewards/reward_func/std": 0.5089049935340881,
      "step": 854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8787706345319748,
      "epoch": 1.669921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.31136733293533325,
      "learning_rate": 5.052631578947369e-07,
      "loss": 0.0,
      "num_tokens": 4026216.0,
      "reward": 0.8695999979972839,
      "reward_std": 0.2536522150039673,
      "rewards/reward_func/mean": 0.8695999979972839,
      "rewards/reward_func/std": 0.35150691866874695,
      "step": 855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5140975881367922,
      "epoch": 1.671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2465497851371765,
      "learning_rate": 5.000000000000001e-07,
      "loss": -0.0,
      "num_tokens": 4031008.0,
      "reward": 0.4752500057220459,
      "reward_std": 0.5488084554672241,
      "rewards/reward_func/mean": 0.4752500057220459,
      "rewards/reward_func/std": 0.5081146359443665,
      "step": 856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6999778598546982,
      "epoch": 1.673828125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 4.947368421052632e-07,
      "loss": 0.0,
      "num_tokens": 4036000.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4997900687158108,
      "epoch": 1.67578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21455521881580353,
      "learning_rate": 4.894736842105263e-07,
      "loss": -0.0,
      "num_tokens": 4040968.0,
      "reward": 0.23787499964237213,
      "reward_std": 0.4509041905403137,
      "rewards/reward_func/mean": 0.23787501454353333,
      "rewards/reward_func/std": 0.41770508885383606,
      "step": 858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5979567542672157,
      "epoch": 1.677734375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17116230726242065,
      "learning_rate": 4.842105263157895e-07,
      "loss": 0.0,
      "num_tokens": 4045632.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8094654753804207,
      "epoch": 1.6796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2224608212709427,
      "learning_rate": 4.789473684210526e-07,
      "loss": -0.0,
      "num_tokens": 4050216.0,
      "reward": 0.7447500228881836,
      "reward_std": 0.2991751432418823,
      "rewards/reward_func/mean": 0.7447500228881836,
      "rewards/reward_func/std": 0.4598980247974396,
      "step": 860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6717873178422451,
      "epoch": 1.681640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24959073960781097,
      "learning_rate": 4.7368421052631585e-07,
      "loss": 0.0,
      "num_tokens": 4054856.0,
      "reward": 0.625,
      "reward_std": 0.5386751294136047,
      "rewards/reward_func/mean": 0.625,
      "rewards/reward_func/std": 0.5175492167472839,
      "step": 861
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6693816427141428,
      "epoch": 1.68359375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1378486603498459,
      "learning_rate": 4.6842105263157896e-07,
      "loss": 0.0,
      "num_tokens": 4059712.0,
      "reward": 0.8765624761581421,
      "reward_std": 0.24687500298023224,
      "rewards/reward_func/mean": 0.8765624761581421,
      "rewards/reward_func/std": 0.34913399815559387,
      "step": 862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4828580655157566,
      "epoch": 1.685546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19800472259521484,
      "learning_rate": 4.631578947368422e-07,
      "loss": 0.0,
      "num_tokens": 4064496.0,
      "reward": 0.13242501020431519,
      "reward_std": 0.24046964943408966,
      "rewards/reward_func/mean": 0.132424995303154,
      "rewards/reward_func/std": 0.32472512125968933,
      "step": 863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7691012695431709,
      "epoch": 1.6875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22753512859344482,
      "learning_rate": 4.578947368421053e-07,
      "loss": 0.0,
      "num_tokens": 4069032.0,
      "reward": 0.6379250288009644,
      "reward_std": 0.5109583139419556,
      "rewards/reward_func/mean": 0.6379250288009644,
      "rewards/reward_func/std": 0.4939082860946655,
      "step": 864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6634176559746265,
      "epoch": 1.689453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24916769564151764,
      "learning_rate": 4.526315789473685e-07,
      "loss": -0.0,
      "num_tokens": 4074024.0,
      "reward": 0.23919999599456787,
      "reward_std": 0.47839999198913574,
      "rewards/reward_func/mean": 0.23919999599456787,
      "rewards/reward_func/std": 0.44306278228759766,
      "step": 865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6947410106658936,
      "epoch": 1.69140625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2586158812046051,
      "learning_rate": 4.473684210526316e-07,
      "loss": 0.0,
      "num_tokens": 4078528.0,
      "reward": 0.7488625049591064,
      "reward_std": 0.29228225350379944,
      "rewards/reward_func/mean": 0.7488625049591064,
      "rewards/reward_func/std": 0.4584231972694397,
      "step": 866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.38118401169776917,
      "epoch": 1.693359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20857468247413635,
      "learning_rate": 4.421052631578947e-07,
      "loss": 0.0,
      "num_tokens": 4083312.0,
      "reward": 0.2487500011920929,
      "reward_std": 0.46485579013824463,
      "rewards/reward_func/mean": 0.2487500011920929,
      "rewards/reward_func/std": 0.4304842948913574,
      "step": 867
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6276389136910439,
      "epoch": 1.6953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23110730946063995,
      "learning_rate": 4.3684210526315794e-07,
      "loss": 0.0,
      "num_tokens": 4087952.0,
      "reward": 0.75,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5682788696140051,
      "epoch": 1.697265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.230777308344841,
      "learning_rate": 4.3157894736842105e-07,
      "loss": -0.0,
      "num_tokens": 4092624.0,
      "reward": 0.5,
      "reward_std": 0.5773502588272095,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5345224738121033,
      "step": 869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5546636991202831,
      "epoch": 1.69921875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15263542532920837,
      "learning_rate": 4.2631578947368427e-07,
      "loss": 0.0,
      "num_tokens": 4096984.0,
      "reward": 0.8843749761581421,
      "reward_std": 0.23125000298023224,
      "rewards/reward_func/mean": 0.8843749761581421,
      "rewards/reward_func/std": 0.32703688740730286,
      "step": 870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.375160763040185,
      "epoch": 1.701171875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18006189167499542,
      "learning_rate": 4.210526315789474e-07,
      "loss": 0.0,
      "num_tokens": 4101480.0,
      "reward": 0.46480000019073486,
      "reward_std": 0.46480000019073486,
      "rewards/reward_func/mean": 0.46480000019073486,
      "rewards/reward_func/std": 0.49689212441444397,
      "step": 871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.780842486768961,
      "epoch": 1.703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2666405439376831,
      "learning_rate": 4.157894736842106e-07,
      "loss": -0.0,
      "num_tokens": 4106272.0,
      "reward": 0.7378374934196472,
      "reward_std": 0.4632420837879181,
      "rewards/reward_func/mean": 0.7378374934196472,
      "rewards/reward_func/std": 0.429521769285202,
      "step": 872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.480727631598711,
      "epoch": 1.705078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2148435115814209,
      "learning_rate": 4.105263157894737e-07,
      "loss": 0.0,
      "num_tokens": 4111048.0,
      "reward": 0.2475000023841858,
      "reward_std": 0.2923952341079712,
      "rewards/reward_func/mean": 0.2475000023841858,
      "rewards/reward_func/std": 0.42844611406326294,
      "step": 873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4629284869879484,
      "epoch": 1.70703125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14672014117240906,
      "learning_rate": 4.052631578947368e-07,
      "loss": 0.0,
      "num_tokens": 4115824.0,
      "reward": 0.24056249856948853,
      "reward_std": 0.274181067943573,
      "rewards/reward_func/mean": 0.24056249856948853,
      "rewards/reward_func/std": 0.44159868359565735,
      "step": 874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5807988476008177,
      "epoch": 1.708984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19631755352020264,
      "learning_rate": 4.0000000000000003e-07,
      "loss": -0.0,
      "num_tokens": 4120472.0,
      "reward": 0.25,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.25,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5514065846800804,
      "epoch": 1.7109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2597048282623291,
      "learning_rate": 3.9473684210526315e-07,
      "loss": 0.0,
      "num_tokens": 4124832.0,
      "reward": 0.768750011920929,
      "reward_std": 0.4625000059604645,
      "rewards/reward_func/mean": 0.768750011920929,
      "rewards/reward_func/std": 0.42819181084632874,
      "step": 876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6845806501805782,
      "epoch": 1.712890625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1643269956111908,
      "learning_rate": 3.8947368421052636e-07,
      "loss": 0.0,
      "num_tokens": 4129696.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.433909023180604,
      "epoch": 1.71484375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18823891878128052,
      "learning_rate": 3.8421052631578947e-07,
      "loss": 0.0,
      "num_tokens": 4134664.0,
      "reward": 0.11256249994039536,
      "reward_std": 0.2251249998807907,
      "rewards/reward_func/mean": 0.11256249994039536,
      "rewards/reward_func/std": 0.31335461139678955,
      "step": 878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5910208597779274,
      "epoch": 1.716796875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22284384071826935,
      "learning_rate": 3.789473684210527e-07,
      "loss": 0.0,
      "num_tokens": 4139456.0,
      "reward": 0.6029999852180481,
      "reward_std": 0.5188616514205933,
      "rewards/reward_func/mean": 0.6029999852180481,
      "rewards/reward_func/std": 0.4995529353618622,
      "step": 879
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8369920663535595,
      "epoch": 1.71875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2716849446296692,
      "learning_rate": 3.736842105263158e-07,
      "loss": 0.0,
      "num_tokens": 4144216.0,
      "reward": 0.6250250339508057,
      "reward_std": 0.5000154972076416,
      "rewards/reward_func/mean": 0.6250250339508057,
      "rewards/reward_func/std": 0.4835914373397827,
      "step": 880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5901597924530506,
      "epoch": 1.720703125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15631148219108582,
      "learning_rate": 3.6842105263157896e-07,
      "loss": -0.0,
      "num_tokens": 4148832.0,
      "reward": 0.8573124408721924,
      "reward_std": 0.24137499928474426,
      "rewards/reward_func/mean": 0.8573124408721924,
      "rewards/reward_func/std": 0.3413558006286621,
      "step": 881
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.618029011413455,
      "epoch": 1.72265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2383870780467987,
      "learning_rate": 3.6315789473684213e-07,
      "loss": 0.0,
      "num_tokens": 4153624.0,
      "reward": 0.5831999778747559,
      "reward_std": 0.5035399198532104,
      "rewards/reward_func/mean": 0.5831999778747559,
      "rewards/reward_func/std": 0.4830016791820526,
      "step": 882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.741577934473753,
      "epoch": 1.724609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26154232025146484,
      "learning_rate": 3.578947368421053e-07,
      "loss": 0.0,
      "num_tokens": 4158272.0,
      "reward": 0.5,
      "reward_std": 0.5,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5345224738121033,
      "step": 883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6781878937035799,
      "epoch": 1.7265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2297692745923996,
      "learning_rate": 3.5263157894736846e-07,
      "loss": -0.0,
      "num_tokens": 4163288.0,
      "reward": 0.45639997720718384,
      "reward_std": 0.5271494388580322,
      "rewards/reward_func/mean": 0.45639997720718384,
      "rewards/reward_func/std": 0.48832178115844727,
      "step": 884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.770763710141182,
      "epoch": 1.728515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24367251992225647,
      "learning_rate": 3.4736842105263157e-07,
      "loss": -0.0,
      "num_tokens": 4168048.0,
      "reward": 0.49502497911453247,
      "reward_std": 0.5504268407821655,
      "rewards/reward_func/mean": 0.49502500891685486,
      "rewards/reward_func/std": 0.5097962021827698,
      "step": 885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.757527232170105,
      "epoch": 1.73046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21755702793598175,
      "learning_rate": 3.421052631578948e-07,
      "loss": -0.0,
      "num_tokens": 4172808.0,
      "reward": 0.6312999725341797,
      "reward_std": 0.24971508979797363,
      "rewards/reward_func/mean": 0.6312999725341797,
      "rewards/reward_func/std": 0.4959184527397156,
      "step": 886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5246509797871113,
      "epoch": 1.732421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2858599126338959,
      "learning_rate": 3.368421052631579e-07,
      "loss": -0.0,
      "num_tokens": 4177456.0,
      "reward": 0.4972499907016754,
      "reward_std": 0.498220831155777,
      "rewards/reward_func/mean": 0.4972499907016754,
      "rewards/reward_func/std": 0.5316314101219177,
      "step": 887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.788298238068819,
      "epoch": 1.734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24566468596458435,
      "learning_rate": 3.315789473684211e-07,
      "loss": 0.0,
      "num_tokens": 4182456.0,
      "reward": 0.3424000144004822,
      "reward_std": 0.4942222833633423,
      "rewards/reward_func/mean": 0.3424000144004822,
      "rewards/reward_func/std": 0.472703218460083,
      "step": 888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.625529520213604,
      "epoch": 1.736328125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 3.263157894736842e-07,
      "loss": 0.0,
      "num_tokens": 4187440.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.0,
      "rewards/reward_func/std": 0.0,
      "step": 889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5419940818101168,
      "epoch": 1.73828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24041259288787842,
      "learning_rate": 3.210526315789474e-07,
      "loss": -0.0,
      "num_tokens": 4192360.0,
      "reward": 0.5180374979972839,
      "reward_std": 0.4634375274181366,
      "rewards/reward_func/mean": 0.5180374979972839,
      "rewards/reward_func/std": 0.5047063231468201,
      "step": 890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7481512352824211,
      "epoch": 1.740234375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16213132441043854,
      "learning_rate": 3.1578947368421055e-07,
      "loss": 0.0,
      "num_tokens": 4196952.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8599461615085602,
      "epoch": 1.7421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2375580370426178,
      "learning_rate": 3.105263157894737e-07,
      "loss": -0.0,
      "num_tokens": 4201568.0,
      "reward": 0.8697500228881836,
      "reward_std": 0.2605000138282776,
      "rewards/reward_func/mean": 0.8697500228881836,
      "rewards/reward_func/std": 0.3517392575740814,
      "step": 892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8133350238204002,
      "epoch": 1.744140625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.21218901872634888,
      "learning_rate": 3.052631578947369e-07,
      "loss": 0.0,
      "num_tokens": 4206320.0,
      "reward": 0.8271875381469727,
      "reward_std": 0.22562499344348907,
      "rewards/reward_func/mean": 0.8271875381469727,
      "rewards/reward_func/std": 0.3190819323062897,
      "step": 893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.44859400019049644,
      "epoch": 1.74609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18427495658397675,
      "learning_rate": 3.0000000000000004e-07,
      "loss": 0.0,
      "num_tokens": 4210984.0,
      "reward": 0.38631248474121094,
      "reward_std": 0.23885026574134827,
      "rewards/reward_func/mean": 0.38631248474121094,
      "rewards/reward_func/std": 0.5027945637702942,
      "step": 894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6991079337894917,
      "epoch": 1.748046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2558084726333618,
      "learning_rate": 2.9473684210526315e-07,
      "loss": 0.0,
      "num_tokens": 4215472.0,
      "reward": 0.628125011920929,
      "reward_std": 0.5337572693824768,
      "rewards/reward_func/mean": 0.628125011920929,
      "rewards/reward_func/std": 0.5132507681846619,
      "step": 895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8943945094943047,
      "epoch": 1.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24125763773918152,
      "learning_rate": 2.894736842105263e-07,
      "loss": 0.0,
      "num_tokens": 4219952.0,
      "reward": 0.5051125288009644,
      "reward_std": 0.5653034448623657,
      "rewards/reward_func/mean": 0.5051125288009644,
      "rewards/reward_func/std": 0.5234332084655762,
      "step": 896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6643506474792957,
      "epoch": 1.751953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19916725158691406,
      "learning_rate": 2.842105263157895e-07,
      "loss": 0.0,
      "num_tokens": 4224704.0,
      "reward": 0.8607000112533569,
      "reward_std": 0.2525746822357178,
      "rewards/reward_func/mean": 0.8607000112533569,
      "rewards/reward_func/std": 0.34793561697006226,
      "step": 897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6863343082368374,
      "epoch": 1.75390625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18417027592658997,
      "learning_rate": 2.7894736842105264e-07,
      "loss": -0.0,
      "num_tokens": 4229464.0,
      "reward": 0.7471874952316284,
      "reward_std": 0.2749817669391632,
      "rewards/reward_func/mean": 0.7471874952316284,
      "rewards/reward_func/std": 0.45018935203552246,
      "step": 898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.703750491142273,
      "epoch": 1.755859375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2679993808269501,
      "learning_rate": 2.736842105263158e-07,
      "loss": 0.0,
      "num_tokens": 4234464.0,
      "reward": 0.33969998359680176,
      "reward_std": 0.4910672605037689,
      "rewards/reward_func/mean": 0.33969998359680176,
      "rewards/reward_func/std": 0.46902716159820557,
      "step": 899
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8419626951217651,
      "epoch": 1.7578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28019896149635315,
      "learning_rate": 2.6842105263157897e-07,
      "loss": 0.0,
      "num_tokens": 4238976.0,
      "reward": 0.7445999979972839,
      "reward_std": 0.49645256996154785,
      "rewards/reward_func/mean": 0.7445999979972839,
      "rewards/reward_func/std": 0.45967379212379456,
      "step": 900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8848395217210054,
      "epoch": 1.759765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2501536011695862,
      "learning_rate": 2.6315789473684213e-07,
      "loss": 0.0,
      "num_tokens": 4243728.0,
      "reward": 0.73808753490448,
      "reward_std": 0.4636053442955017,
      "rewards/reward_func/mean": 0.7380874752998352,
      "rewards/reward_func/std": 0.4300731420516968,
      "step": 901
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5995431952178478,
      "epoch": 1.76171875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1700550615787506,
      "learning_rate": 2.578947368421053e-07,
      "loss": -0.0,
      "num_tokens": 4248720.0,
      "reward": 0.2184000015258789,
      "reward_std": 0.2522551119327545,
      "rewards/reward_func/mean": 0.2184000015258789,
      "rewards/reward_func/std": 0.4044714868068695,
      "step": 902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8698443882167339,
      "epoch": 1.763671875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2777104675769806,
      "learning_rate": 2.5263157894736846e-07,
      "loss": -0.0,
      "num_tokens": 4253256.0,
      "reward": 0.504687488079071,
      "reward_std": 0.5720410346984863,
      "rewards/reward_func/mean": 0.504687488079071,
      "rewards/reward_func/std": 0.5296536087989807,
      "step": 903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7046304792165756,
      "epoch": 1.765625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23841030895709991,
      "learning_rate": 2.473684210526316e-07,
      "loss": 0.0,
      "num_tokens": 4258024.0,
      "reward": 0.6187999844551086,
      "reward_std": 0.5329432487487793,
      "rewards/reward_func/mean": 0.6187999844551086,
      "rewards/reward_func/std": 0.5125207901000977,
      "step": 904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7159217670559883,
      "epoch": 1.767578125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2175939530134201,
      "learning_rate": 2.4210526315789473e-07,
      "loss": 0.0,
      "num_tokens": 4263024.0,
      "reward": 0.3587000072002411,
      "reward_std": 0.5176094770431519,
      "rewards/reward_func/mean": 0.3587000072002411,
      "rewards/reward_func/std": 0.4951927065849304,
      "step": 905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4197417329996824,
      "epoch": 1.76953125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18832392990589142,
      "learning_rate": 2.3684210526315792e-07,
      "loss": 0.0,
      "num_tokens": 4267968.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.014433757402002811,
      "rewards/reward_func/mean": 0.012500000186264515,
      "rewards/reward_func/std": 0.013363063335418701,
      "step": 906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7613006345927715,
      "epoch": 1.771484375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17875666916370392,
      "learning_rate": 2.315789473684211e-07,
      "loss": -0.0,
      "num_tokens": 4272328.0,
      "reward": 0.9947500228881836,
      "reward_std": 0.010500003583729267,
      "rewards/reward_func/mean": 0.9947500228881836,
      "rewards/reward_func/std": 0.014849240891635418,
      "step": 907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7562070265412331,
      "epoch": 1.7734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2620117664337158,
      "learning_rate": 2.2631578947368425e-07,
      "loss": 0.0,
      "num_tokens": 4276768.0,
      "reward": 0.6265624761581421,
      "reward_std": 0.5368822813034058,
      "rewards/reward_func/mean": 0.6265624761581421,
      "rewards/reward_func/std": 0.515407145023346,
      "step": 908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.735223077237606,
      "epoch": 1.775390625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24371092021465302,
      "learning_rate": 2.2105263157894736e-07,
      "loss": -0.0,
      "num_tokens": 4281480.0,
      "reward": 0.9846999645233154,
      "reward_std": 0.03060000017285347,
      "rewards/reward_func/mean": 0.9846999645233154,
      "rewards/reward_func/std": 0.030131708830595016,
      "step": 909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.888181284070015,
      "epoch": 1.77734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2600753605365753,
      "learning_rate": 2.1578947368421053e-07,
      "loss": 0.0,
      "num_tokens": 4285840.0,
      "reward": 0.5,
      "reward_std": 0.5773502588272095,
      "rewards/reward_func/mean": 0.5,
      "rewards/reward_func/std": 0.5345224738121033,
      "step": 910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6173073314130306,
      "epoch": 1.779296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22857871651649475,
      "learning_rate": 2.105263157894737e-07,
      "loss": -0.0,
      "num_tokens": 4290496.0,
      "reward": 0.37229999899864197,
      "reward_std": 0.5355914831161499,
      "rewards/reward_func/mean": 0.37229999899864197,
      "rewards/reward_func/std": 0.5138660669326782,
      "step": 911
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5189631078392267,
      "epoch": 1.78125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14599941670894623,
      "learning_rate": 2.0526315789473685e-07,
      "loss": -0.0,
      "num_tokens": 4295008.0,
      "reward": 0.8134000301361084,
      "reward_std": 0.23240000009536743,
      "rewards/reward_func/mean": 0.8134000301361084,
      "rewards/reward_func/std": 0.3286632299423218,
      "step": 912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8033823296427727,
      "epoch": 1.783203125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18053551018238068,
      "learning_rate": 2.0000000000000002e-07,
      "loss": 0.0,
      "num_tokens": 4299768.0,
      "reward": 0.8362500071525574,
      "reward_std": 0.22793914377689362,
      "rewards/reward_func/mean": 0.8362500071525574,
      "rewards/reward_func/std": 0.3183859884738922,
      "step": 913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6970736794173717,
      "epoch": 1.78515625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.16842643916606903,
      "learning_rate": 1.9473684210526318e-07,
      "loss": -0.0,
      "num_tokens": 4304520.0,
      "reward": 0.8224999904632568,
      "reward_std": 0.23499999940395355,
      "rewards/reward_func/mean": 0.8224999904632568,
      "rewards/reward_func/std": 0.33234018087387085,
      "step": 914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6607658788561821,
      "epoch": 1.787109375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22088974714279175,
      "learning_rate": 1.8947368421052634e-07,
      "loss": 0.0,
      "num_tokens": 4309296.0,
      "reward": 0.7371000051498413,
      "reward_std": 0.28240811824798584,
      "rewards/reward_func/mean": 0.7371000051498413,
      "rewards/reward_func/std": 0.4553808271884918,
      "step": 915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6025678031146526,
      "epoch": 1.7890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2665260136127472,
      "learning_rate": 1.8421052631578948e-07,
      "loss": 0.0,
      "num_tokens": 4314088.0,
      "reward": 0.8602499961853027,
      "reward_std": 0.24355539679527283,
      "rewards/reward_func/mean": 0.8602499961853027,
      "rewards/reward_func/std": 0.32780489325523376,
      "step": 916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.421773798763752,
      "epoch": 1.791015625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12458179891109467,
      "learning_rate": 1.7894736842105265e-07,
      "loss": 0.0,
      "num_tokens": 4319040.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/reward_func/mean": 0.0031250000465661287,
      "rewards/reward_func/std": 0.008838835172355175,
      "step": 917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8600736558437347,
      "epoch": 1.79296875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2437726855278015,
      "learning_rate": 1.7368421052631578e-07,
      "loss": -0.0,
      "num_tokens": 4323544.0,
      "reward": 0.5066750049591064,
      "reward_std": 0.4920022189617157,
      "rewards/reward_func/mean": 0.5066750049591064,
      "rewards/reward_func/std": 0.5217258930206299,
      "step": 918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7552273385226727,
      "epoch": 1.794921875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23656296730041504,
      "learning_rate": 1.6842105263157895e-07,
      "loss": -0.0,
      "num_tokens": 4328408.0,
      "reward": 0.8660625219345093,
      "reward_std": 0.2540762424468994,
      "rewards/reward_func/mean": 0.8660625219345093,
      "rewards/reward_func/std": 0.34541285037994385,
      "step": 919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6802506037056446,
      "epoch": 1.796875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1381627321243286,
      "learning_rate": 1.631578947368421e-07,
      "loss": 0.0,
      "num_tokens": 4333080.0,
      "reward": 0.875,
      "reward_std": 0.25,
      "rewards/reward_func/mean": 0.875,
      "rewards/reward_func/std": 0.3535533845424652,
      "step": 920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.19835966220125556,
      "epoch": 1.798828125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.5789473684210527e-07,
      "loss": 0.0,
      "num_tokens": 4337592.0,
      "reward": 0.9296000003814697,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 0.9296000003814697,
      "rewards/reward_func/std": 0.0,
      "step": 921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8462485000491142,
      "epoch": 1.80078125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24740563333034515,
      "learning_rate": 1.5263157894736844e-07,
      "loss": 0.0,
      "num_tokens": 4342608.0,
      "reward": 0.47290000319480896,
      "reward_std": 0.5461318492889404,
      "rewards/reward_func/mean": 0.47290000319480896,
      "rewards/reward_func/std": 0.5058882236480713,
      "step": 922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4742927271872759,
      "epoch": 1.802734375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.2252196073532104,
      "learning_rate": 1.4736842105263158e-07,
      "loss": 0.0,
      "num_tokens": 4347248.0,
      "reward": 0.3668999969959259,
      "reward_std": 0.5270397663116455,
      "rewards/reward_func/mean": 0.3668999969959259,
      "rewards/reward_func/std": 0.5063701272010803,
      "step": 923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6519879586994648,
      "epoch": 1.8046875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25893595814704895,
      "learning_rate": 1.4210526315789474e-07,
      "loss": -0.0,
      "num_tokens": 4352264.0,
      "reward": 0.23256249725818634,
      "reward_std": 0.4609765410423279,
      "rewards/reward_func/mean": 0.23256249725818634,
      "rewards/reward_func/std": 0.4271462559700012,
      "step": 924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8685908392071724,
      "epoch": 1.806640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24174730479717255,
      "learning_rate": 1.368421052631579e-07,
      "loss": -0.0,
      "num_tokens": 4356792.0,
      "reward": 0.6343749761581421,
      "reward_std": 0.5239908695220947,
      "rewards/reward_func/mean": 0.6343749761581421,
      "rewards/reward_func/std": 0.5047431588172913,
      "step": 925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.584613062441349,
      "epoch": 1.80859375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15689092874526978,
      "learning_rate": 1.3157894736842107e-07,
      "loss": 0.0,
      "num_tokens": 4361448.0,
      "reward": 0.760937511920929,
      "reward_std": 0.27662280201911926,
      "rewards/reward_func/mean": 0.760937511920929,
      "rewards/reward_func/std": 0.4432750344276428,
      "step": 926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6101187616586685,
      "epoch": 1.810546875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17513932287693024,
      "learning_rate": 1.2631578947368423e-07,
      "loss": 0.0,
      "num_tokens": 4366136.0,
      "reward": 0.75,
      "reward_std": 0.28867512941360474,
      "rewards/reward_func/mean": 0.75,
      "rewards/reward_func/std": 0.4629100561141968,
      "step": 927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6578428782522678,
      "epoch": 1.8125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19013863801956177,
      "learning_rate": 1.2105263157894737e-07,
      "loss": 0.0,
      "num_tokens": 4371112.0,
      "reward": 0.21320000290870667,
      "reward_std": 0.42640000581741333,
      "rewards/reward_func/mean": 0.21320000290870667,
      "rewards/reward_func/std": 0.3952178359031677,
      "step": 928
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.505165308713913,
      "epoch": 1.814453125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.9573839902877808,
      "learning_rate": 1.1578947368421054e-07,
      "loss": -0.0,
      "num_tokens": 4375760.0,
      "reward": 0.6168624758720398,
      "reward_std": 0.5261497497558594,
      "rewards/reward_func/mean": 0.6168625354766846,
      "rewards/reward_func/std": 0.5074918866157532,
      "step": 929
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.874191090464592,
      "epoch": 1.81640625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2154332995414734,
      "learning_rate": 1.1052631578947368e-07,
      "loss": 0.0,
      "num_tokens": 4380264.0,
      "reward": 0.6285499930381775,
      "reward_std": 0.2513039708137512,
      "rewards/reward_func/mean": 0.6285499930381775,
      "rewards/reward_func/std": 0.5067988634109497,
      "step": 930
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.9102950617671013,
      "epoch": 1.818359375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1660175621509552,
      "learning_rate": 1.0526315789473685e-07,
      "loss": 0.0,
      "num_tokens": 4384872.0,
      "reward": 0.8723000288009644,
      "reward_std": 0.24825221300125122,
      "rewards/reward_func/mean": 0.8723000288009644,
      "rewards/reward_func/std": 0.3525434732437134,
      "step": 931
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.564951941370964,
      "epoch": 1.8203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23926304280757904,
      "learning_rate": 1.0000000000000001e-07,
      "loss": -0.0,
      "num_tokens": 4389696.0,
      "reward": 0.1210624948143959,
      "reward_std": 0.2421249896287918,
      "rewards/reward_func/mean": 0.1210624948143959,
      "rewards/reward_func/std": 0.33739402890205383,
      "step": 932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4994359500706196,
      "epoch": 1.822265625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25481706857681274,
      "learning_rate": 9.473684210526317e-08,
      "loss": 0.0,
      "num_tokens": 4394352.0,
      "reward": 0.629212498664856,
      "reward_std": 0.49713975191116333,
      "rewards/reward_func/mean": 0.629212498664856,
      "rewards/reward_func/std": 0.47718602418899536,
      "step": 933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4711499195545912,
      "epoch": 1.82421875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19426177442073822,
      "learning_rate": 8.947368421052632e-08,
      "loss": -0.0,
      "num_tokens": 4399000.0,
      "reward": 0.629349946975708,
      "reward_std": 0.5124804973602295,
      "rewards/reward_func/mean": 0.6293500065803528,
      "rewards/reward_func/std": 0.4940544664859772,
      "step": 934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8083347305655479,
      "epoch": 1.826171875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.17153169214725494,
      "learning_rate": 8.421052631578947e-08,
      "loss": 0.0,
      "num_tokens": 4403608.0,
      "reward": 0.7593749761581421,
      "reward_std": 0.27827125787734985,
      "rewards/reward_func/mean": 0.7593749761581421,
      "rewards/reward_func/std": 0.4460016191005707,
      "step": 935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6882787756621838,
      "epoch": 1.828125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26333948969841003,
      "learning_rate": 7.894736842105264e-08,
      "loss": 0.0,
      "num_tokens": 4408400.0,
      "reward": 0.5950000286102295,
      "reward_std": 0.5117709040641785,
      "rewards/reward_func/mean": 0.5950000286102295,
      "rewards/reward_func/std": 0.4931241571903229,
      "step": 936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8129791170358658,
      "epoch": 1.830078125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.368421052631579e-08,
      "loss": 0.0,
      "num_tokens": 4412896.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/reward_func/mean": 1.0,
      "rewards/reward_func/std": 0.0,
      "step": 937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.45138827711343765,
      "epoch": 1.83203125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.18060848116874695,
      "learning_rate": 6.842105263157895e-08,
      "loss": 0.0,
      "num_tokens": 4417680.0,
      "reward": 0.3540624976158142,
      "reward_std": 0.5053315162658691,
      "rewards/reward_func/mean": 0.3540624976158142,
      "rewards/reward_func/std": 0.4852207601070404,
      "step": 938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6235154792666435,
      "epoch": 1.833984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.16618505120277405,
      "learning_rate": 6.315789473684211e-08,
      "loss": -0.0,
      "num_tokens": 4422320.0,
      "reward": 0.7527874708175659,
      "reward_std": 0.4729631543159485,
      "rewards/reward_func/mean": 0.7527874708175659,
      "rewards/reward_func/std": 0.4385221302509308,
      "step": 939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.32566132955253124,
      "epoch": 1.8359375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1724817156791687,
      "learning_rate": 5.789473684210527e-08,
      "loss": 0.0,
      "num_tokens": 4426960.0,
      "reward": 0.38749998807907104,
      "reward_std": 0.5304675102233887,
      "rewards/reward_func/mean": 0.38750001788139343,
      "rewards/reward_func/std": 0.5074885487556458,
      "step": 940
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.3569133039563894,
      "epoch": 1.837890625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.17535947263240814,
      "learning_rate": 5.263157894736842e-08,
      "loss": 0.0,
      "num_tokens": 4431448.0,
      "reward": 0.6987625360488892,
      "reward_std": 0.4616749882698059,
      "rewards/reward_func/mean": 0.6987625360488892,
      "rewards/reward_func/std": 0.4274410605430603,
      "step": 941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5731943100690842,
      "epoch": 1.83984375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19585168361663818,
      "learning_rate": 4.7368421052631586e-08,
      "loss": 0.0,
      "num_tokens": 4436104.0,
      "reward": 0.3721874952316284,
      "reward_std": 0.5249817371368408,
      "rewards/reward_func/mean": 0.3721874952316284,
      "rewards/reward_func/std": 0.5038166046142578,
      "step": 942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.5804979503154755,
      "epoch": 1.841796875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15701980888843536,
      "learning_rate": 4.2105263157894737e-08,
      "loss": 0.0,
      "num_tokens": 4441096.0,
      "reward": 0.22850000858306885,
      "reward_std": 0.26406246423721313,
      "rewards/reward_func/mean": 0.22850000858306885,
      "rewards/reward_func/std": 0.4233279824256897,
      "step": 943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.8702232874929905,
      "epoch": 1.84375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30385154485702515,
      "learning_rate": 3.6842105263157894e-08,
      "loss": 0.0,
      "num_tokens": 4445896.0,
      "reward": 0.5129749774932861,
      "reward_std": 0.47286224365234375,
      "rewards/reward_func/mean": 0.5129749774932861,
      "rewards/reward_func/std": 0.5019313097000122,
      "step": 944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6899589747190475,
      "epoch": 1.845703125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23425976932048798,
      "learning_rate": 3.157894736842106e-08,
      "loss": 0.0,
      "num_tokens": 4450384.0,
      "reward": 0.8695999979972839,
      "reward_std": 0.25623539090156555,
      "rewards/reward_func/mean": 0.8695999979972839,
      "rewards/reward_func/std": 0.35150691866874695,
      "step": 945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.7242371030151844,
      "epoch": 1.84765625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.18696576356887817,
      "learning_rate": 2.631578947368421e-08,
      "loss": 0.0,
      "num_tokens": 4454728.0,
      "reward": 0.8765624761581421,
      "reward_std": 0.24687500298023224,
      "rewards/reward_func/mean": 0.8765624761581421,
      "rewards/reward_func/std": 0.34913399815559387,
      "step": 946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6600115075707436,
      "epoch": 1.849609375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26843783259391785,
      "learning_rate": 2.1052631578947368e-08,
      "loss": -0.0,
      "num_tokens": 4459744.0,
      "reward": 0.350600004196167,
      "reward_std": 0.4967568516731262,
      "rewards/reward_func/mean": 0.350600004196167,
      "rewards/reward_func/std": 0.4848525822162628,
      "step": 947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.6009881878271699,
      "epoch": 1.8515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2861350476741791,
      "learning_rate": 1.578947368421053e-08,
      "loss": -0.0,
      "num_tokens": 4464248.0,
      "reward": 0.5809999704360962,
      "reward_std": 0.5007524490356445,
      "rewards/reward_func/mean": 0.5809999704360962,
      "rewards/reward_func/std": 0.4811137020587921,
      "step": 948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.4016810180619359,
      "epoch": 1.853515625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.19072452187538147,
      "learning_rate": 1.0526315789473684e-08,
      "loss": -0.0,
      "num_tokens": 4468744.0,
      "reward": 0.5809999704360962,
      "reward_std": 0.5007524490356445,
      "rewards/reward_func/mean": 0.5809999704360962,
      "rewards/reward_func/std": 0.4811137020587921,
      "step": 949
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 384.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.92277492582798,
      "epoch": 1.85546875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2709886431694031,
      "learning_rate": 5.263157894736842e-09,
      "loss": 0.0,
      "num_tokens": 4473248.0,
      "reward": 0.7473000288009644,
      "reward_std": 0.4982522130012512,
      "rewards/reward_func/mean": 0.7473000288009644,
      "rewards/reward_func/std": 0.4613037705421448,
      "step": 950
    }
  ],
  "logging_steps": 1,
  "max_steps": 950,
  "num_input_tokens_seen": 4473248,
  "num_train_epochs": 2,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}