{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.11048059056897504,
  "eval_steps": 500,
  "global_step": 1100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00625,
      "completions/max_length": 459.2,
      "completions/max_terminated_length": 272.7,
      "completions/mean_length": 76.24375,
      "completions/mean_terminated_length": 64.11458358764648,
      "completions/min_length": 16.8,
      "completions/min_terminated_length": 16.8,
      "epoch": 0.0010043690051725004,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.0,
      "learning_rate": 1.1999999999999998e-08,
      "loss": 0.0208,
      "num_tokens": 108131.0,
      "reward": 1.2312812566757203,
      "reward_std": 0.05931956073036417,
      "rewards/combined_reward/mean": 1.2312812566757203,
      "rewards/combined_reward/std": 0.4361365109682083,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01875,
      "completions/max_length": 330.9,
      "completions/max_terminated_length": 147.6,
      "completions/mean_length": 75.425,
      "completions/mean_terminated_length": 61.425418090820315,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.002008738010345001,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0,
      "learning_rate": 2.5333333333333335e-08,
      "loss": 0.0279,
      "num_tokens": 233579.0,
      "reward": 1.3428645849227905,
      "reward_std": 0.029872814007103444,
      "rewards/combined_reward/mean": 1.3428645849227905,
      "rewards/combined_reward/std": 0.3860916443169117,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 110.6,
      "completions/max_terminated_length": 110.6,
      "completions/mean_length": 51.04375,
      "completions/mean_terminated_length": 51.04375,
      "completions/min_length": 16.7,
      "completions/min_terminated_length": 16.7,
      "epoch": 0.003013107015517501,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.3646392822265625,
      "learning_rate": 3.866666666666666e-08,
      "loss": -0.0132,
      "num_tokens": 352258.0,
      "reward": 1.323312509059906,
      "reward_std": 0.05337500050663948,
      "rewards/combined_reward/mean": 1.323312509059906,
      "rewards/combined_reward/std": 0.39539981335401536,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01875,
      "completions/max_length": 307.1,
      "completions/max_terminated_length": 211.8,
      "completions/mean_length": 95.83125,
      "completions/mean_terminated_length": 60.24375,
      "completions/min_length": 11.5,
      "completions/min_terminated_length": 11.5,
      "epoch": 0.004017476020690002,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0,
      "learning_rate": 5.2e-08,
      "loss": 0.0143,
      "num_tokens": 485155.0,
      "reward": 1.2628658890724183,
      "reward_std": 0.03280075653456151,
      "rewards/combined_reward/mean": 1.2628658890724183,
      "rewards/combined_reward/std": 0.4110621690750122,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.5,
      "completions/max_terminated_length": 140.5,
      "completions/mean_length": 61.7875,
      "completions/mean_terminated_length": 61.7875,
      "completions/min_length": 23.7,
      "completions/min_terminated_length": 23.7,
      "epoch": 0.005021845025862502,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 1.8525996208190918,
      "learning_rate": 6.533333333333332e-08,
      "loss": 0.0147,
      "num_tokens": 607629.0,
      "reward": 1.3795833349227906,
      "reward_std": 0.00583496168255806,
      "rewards/combined_reward/mean": 1.3795833349227906,
      "rewards/combined_reward/std": 0.30837071537971494,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00625,
      "completions/max_length": 338.5,
      "completions/max_terminated_length": 238.4,
      "completions/mean_length": 102.60625,
      "completions/mean_terminated_length": 91.32791748046876,
      "completions/min_length": 21.6,
      "completions/min_terminated_length": 21.6,
      "epoch": 0.006026214031035002,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 2.992983818054199,
      "learning_rate": 7.866666666666666e-08,
      "loss": 0.0045,
      "num_tokens": 728802.0,
      "reward": 1.3164896011352538,
      "reward_std": 0.02619450243655592,
      "rewards/combined_reward/mean": 1.3164896011352538,
      "rewards/combined_reward/std": 0.3474510669708252,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00625,
      "completions/max_length": 128.0,
      "completions/max_terminated_length": 128.0,
      "completions/mean_length": 62.81875,
      "completions/mean_terminated_length": 61.769583511352536,
      "completions/min_length": 20.2,
      "completions/min_terminated_length": 20.2,
      "epoch": 0.007030583036207502,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 9.2e-08,
      "loss": 0.0098,
      "num_tokens": 836341.0,
      "reward": 1.355798614025116,
      "reward_std": 0.004375000763684511,
      "rewards/combined_reward/mean": 1.355798614025116,
      "rewards/combined_reward/std": 0.29267608374357224,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 110.2,
      "completions/max_terminated_length": 110.2,
      "completions/mean_length": 54.3375,
      "completions/mean_terminated_length": 54.3375,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.008034952041380003,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 13.820528984069824,
      "learning_rate": 1.0533333333333332e-07,
      "loss": 0.0119,
      "num_tokens": 945703.0,
      "reward": 1.4564843893051147,
      "reward_std": 0.003906251955777406,
      "rewards/combined_reward/mean": 1.4564843893051147,
      "rewards/combined_reward/std": 0.1776508768554777,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 201.9,
      "completions/max_terminated_length": 201.9,
      "completions/mean_length": 70.6125,
      "completions/mean_terminated_length": 70.6125,
      "completions/min_length": 21.5,
      "completions/min_terminated_length": 21.5,
      "epoch": 0.009039321046552503,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0,
      "learning_rate": 1.1866666666666667e-07,
      "loss": 0.0195,
      "num_tokens": 1062961.0,
      "reward": 1.3238854348659514,
      "reward_std": 0.005562501423992216,
      "rewards/combined_reward/mean": 1.3238854348659514,
      "rewards/combined_reward/std": 0.22054901346564293,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 113.7,
      "completions/max_terminated_length": 113.7,
      "completions/mean_length": 60.275,
      "completions/mean_terminated_length": 60.275,
      "completions/min_length": 24.1,
      "completions/min_terminated_length": 24.1,
      "epoch": 0.010043690051725004,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.0,
      "learning_rate": 1.32e-07,
      "loss": 0.0058,
      "num_tokens": 1175365.0,
      "reward": 1.4070937514305115,
      "reward_std": 0.034517763555049895,
      "rewards/combined_reward/mean": 1.4070937514305115,
      "rewards/combined_reward/std": 0.26661672741174697,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00625,
      "completions/max_length": 158.5,
      "completions/max_terminated_length": 158.5,
      "completions/mean_length": 65.46875,
      "completions/mean_terminated_length": 64.41750030517578,
      "completions/min_length": 19.1,
      "completions/min_terminated_length": 19.1,
      "epoch": 0.011048059056897505,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.4533333333333334e-07,
      "loss": 0.0019,
      "num_tokens": 1288772.0,
      "reward": 1.2793750286102294,
      "reward_std": 0.0024999996647238733,
      "rewards/combined_reward/mean": 1.2793750286102294,
      "rewards/combined_reward/std": 0.31086390763521193,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00625,
      "completions/max_length": 322.2,
      "completions/max_terminated_length": 134.1,
      "completions/mean_length": 77.01875,
      "completions/mean_terminated_length": 64.77458343505859,
      "completions/min_length": 20.7,
      "completions/min_terminated_length": 20.7,
      "epoch": 0.012052428062070004,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 8.276171684265137,
      "learning_rate": 1.5866666666666666e-07,
      "loss": 0.0134,
      "num_tokens": 1403035.0,
      "reward": 1.3504362106323242,
      "reward_std": 0.030459362699184568,
      "rewards/combined_reward/mean": 1.3504362106323242,
      "rewards/combined_reward/std": 0.309928272664547,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 102.1,
      "completions/max_terminated_length": 102.1,
      "completions/mean_length": 61.0625,
      "completions/mean_terminated_length": 61.0625,
      "completions/min_length": 31.5,
      "completions/min_terminated_length": 31.5,
      "epoch": 0.013056797067242505,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.7199999999999998e-07,
      "loss": -0.0027,
      "num_tokens": 1524697.0,
      "reward": 1.361527794599533,
      "reward_std": 0.008749999664723873,
      "rewards/combined_reward/mean": 1.361527794599533,
      "rewards/combined_reward/std": 0.2736371263861656,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 123.4,
      "completions/max_terminated_length": 123.4,
      "completions/mean_length": 58.05,
      "completions/mean_terminated_length": 58.05,
      "completions/min_length": 16.8,
      "completions/min_terminated_length": 16.8,
      "epoch": 0.014061166072415004,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 3.9411776065826416,
      "learning_rate": 1.8533333333333333e-07,
      "loss": 0.0062,
      "num_tokens": 1622389.0,
      "reward": 1.3123229265213012,
      "reward_std": 0.03212499991059303,
      "rewards/combined_reward/mean": 1.3123229265213012,
      "rewards/combined_reward/std": 0.35334871551021935,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 333.1,
      "completions/max_terminated_length": 135.6,
      "completions/mean_length": 111.125,
      "completions/mean_terminated_length": 61.191666793823245,
      "completions/min_length": 21.8,
      "completions/min_terminated_length": 21.8,
      "epoch": 0.015065535077587506,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.9866666666666665e-07,
      "loss": 0.0039,
      "num_tokens": 1734901.0,
      "reward": 1.2678720355033875,
      "reward_std": 0.0006250014062970877,
      "rewards/combined_reward/mean": 1.2678720355033875,
      "rewards/combined_reward/std": 0.2531693406403065,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 85.7,
      "completions/max_terminated_length": 85.7,
      "completions/mean_length": 48.81875,
      "completions/mean_terminated_length": 48.81875,
      "completions/min_length": 17.9,
      "completions/min_terminated_length": 17.9,
      "epoch": 0.016069904082760007,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.9999507890797406e-07,
      "loss": 0.0046,
      "num_tokens": 1847536.0,
      "reward": 1.345395851135254,
      "reward_std": 0.0016666671261191368,
      "rewards/combined_reward/mean": 1.345395851135254,
      "rewards/combined_reward/std": 0.29257251909002663,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 467.7,
      "completions/max_terminated_length": 277.7,
      "completions/mean_length": 144.23125,
      "completions/mean_terminated_length": 95.81041717529297,
      "completions/min_length": 28.9,
      "completions/min_terminated_length": 28.9,
      "epoch": 0.017074273087932506,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.9997806834748455e-07,
      "loss": -0.0018,
      "num_tokens": 1970837.0,
      "reward": 1.3027083039283753,
      "reward_std": 0.004424501396715641,
      "rewards/combined_reward/mean": 1.3027083039283753,
      "rewards/combined_reward/std": 0.4294335596263409,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 99.5,
      "completions/max_terminated_length": 99.5,
      "completions/mean_length": 50.44375,
      "completions/mean_terminated_length": 50.44375,
      "completions/min_length": 14.6,
      "completions/min_terminated_length": 14.6,
      "epoch": 0.018078642093105005,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.9994890963073946e-07,
      "loss": 0.0059,
      "num_tokens": 2088820.0,
      "reward": 1.2765364408493043,
      "reward_std": 0.00015624959487468005,
      "rewards/combined_reward/mean": 1.2765364408493043,
      "rewards/combined_reward/std": 0.3481216669082642,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 170.0,
      "completions/max_terminated_length": 170.0,
      "completions/mean_length": 67.0,
      "completions/mean_terminated_length": 67.0,
      "completions/min_length": 15.8,
      "completions/min_terminated_length": 15.8,
      "epoch": 0.019083011098277508,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.9990760630076236e-07,
      "loss": -0.0197,
      "num_tokens": 2217116.0,
      "reward": 1.3771250247955322,
      "reward_std": 0.001916667865589261,
      "rewards/combined_reward/mean": 1.3771250247955322,
      "rewards/combined_reward/std": 0.29997652024030685,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 99.2,
      "completions/max_terminated_length": 99.2,
      "completions/mean_length": 41.91875,
      "completions/mean_terminated_length": 41.91875,
      "completions/min_length": 12.8,
      "completions/min_terminated_length": 12.8,
      "epoch": 0.020087380103450007,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 3.989150047302246,
      "learning_rate": 1.99854163376247e-07,
      "loss": 0.0011,
      "num_tokens": 2329863.0,
      "reward": 1.1117187559604644,
      "reward_std": 0.02916821506805718,
      "rewards/combined_reward/mean": 1.1117187559604644,
      "rewards/combined_reward/std": 0.37413454949855807,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 408.0,
      "completions/max_terminated_length": 220.7,
      "completions/mean_length": 133.575,
      "completions/mean_terminated_length": 84.2875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.021091749108622507,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0,
      "learning_rate": 1.9978858735094754e-07,
      "loss": 0.0285,
      "num_tokens": 2457743.0,
      "reward": 1.3693958520889282,
      "reward_std": 0.004563984216656536,
      "rewards/combined_reward/mean": 1.3693958520889282,
      "rewards/combined_reward/std": 0.33579447590745987,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 115.4,
      "completions/max_terminated_length": 115.4,
      "completions/mean_length": 60.24375,
      "completions/mean_terminated_length": 60.24375,
      "completions/min_length": 20.8,
      "completions/min_terminated_length": 20.8,
      "epoch": 0.02209611811379501,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.9971088619288948e-07,
      "loss": 0.0,
      "num_tokens": 2581282.0,
      "reward": 1.284375011920929,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.284375011920929,
      "rewards/combined_reward/std": 0.3291483834385872,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00625,
      "completions/max_length": 110.9,
      "completions/max_terminated_length": 110.9,
      "completions/mean_length": 52.08125,
      "completions/mean_terminated_length": 51.73625030517578,
      "completions/min_length": 15.5,
      "completions/min_terminated_length": 15.5,
      "epoch": 0.02310048711896751,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.996210693434016e-07,
      "loss": 0.0,
      "num_tokens": 2716695.0,
      "reward": 1.3078229188919068,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.3078229188919068,
      "rewards/combined_reward/std": 0.3146174341440201,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01875,
      "completions/max_length": 316.8,
      "completions/max_terminated_length": 296.7,
      "completions/mean_length": 106.325,
      "completions/mean_terminated_length": 71.55961608886719,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.024104856124140008,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.9951914771596858e-07,
      "loss": 0.0,
      "num_tokens": 2820347.0,
      "reward": 1.2994583308696748,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.2994583308696748,
      "rewards/combined_reward/std": 0.35011555850505827,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 105.0,
      "completions/max_terminated_length": 105.0,
      "completions/mean_length": 58.80625,
      "completions/mean_terminated_length": 57.67589340209961,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.02510922512931251,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0,
      "learning_rate": 1.9940513369490513e-07,
      "loss": 0.0119,
      "num_tokens": 2937640.0,
      "reward": 1.2942708253860473,
      "reward_std": 0.0020473659737035633,
      "rewards/combined_reward/mean": 1.2942708253860473,
      "rewards/combined_reward/std": 0.34473495446145536,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.2,
      "completions/max_terminated_length": 136.2,
      "completions/mean_length": 68.56875,
      "completions/mean_terminated_length": 68.56875,
      "completions/min_length": 27.3,
      "completions/min_terminated_length": 27.3,
      "epoch": 0.02611359413448501,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0,
      "learning_rate": 1.9927904113385096e-07,
      "loss": 0.0134,
      "num_tokens": 3051799.0,
      "reward": 1.3380468726158141,
      "reward_std": 0.00270459558814764,
      "rewards/combined_reward/mean": 1.3380468726158141,
      "rewards/combined_reward/std": 0.28382683396339414,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.2,
      "completions/max_terminated_length": 172.2,
      "completions/mean_length": 72.875,
      "completions/mean_terminated_length": 72.875,
      "completions/min_length": 26.4,
      "completions/min_terminated_length": 26.4,
      "epoch": 0.02711796313965751,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.9914088535408765e-07,
      "loss": -0.0019,
      "num_tokens": 3164803.0,
      "reward": 1.4464478969573975,
      "reward_std": 0.0021736113354563712,
      "rewards/combined_reward/mean": 1.4464478969573975,
      "rewards/combined_reward/std": 0.19929498732089995,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.5,
      "completions/max_terminated_length": 140.5,
      "completions/mean_length": 59.38125,
      "completions/mean_terminated_length": 59.38125,
      "completions/min_length": 15.9,
      "completions/min_terminated_length": 15.9,
      "epoch": 0.02812233214483001,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.9899068314267685e-07,
      "loss": 0.001,
      "num_tokens": 3280220.0,
      "reward": 1.3454687356948853,
      "reward_std": 0.004999999329447747,
      "rewards/combined_reward/mean": 1.3454687356948853,
      "rewards/combined_reward/std": 0.31286893486976625,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 94.7,
      "completions/max_terminated_length": 94.7,
      "completions/mean_length": 55.0,
      "completions/mean_terminated_length": 55.0,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.029126701150002512,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.9882845275042067e-07,
      "loss": 0.0065,
      "num_tokens": 3385228.0,
      "reward": 1.4142057299613953,
      "reward_std": 0.00044270951766520736,
      "rewards/combined_reward/mean": 1.4142057299613953,
      "rewards/combined_reward/std": 0.20944447480142117,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 173.3,
      "completions/max_terminated_length": 173.3,
      "completions/mean_length": 76.13125,
      "completions/mean_terminated_length": 76.13125,
      "completions/min_length": 23.4,
      "completions/min_terminated_length": 23.4,
      "epoch": 0.03013107015517501,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.9865421388964382e-07,
      "loss": -0.0017,
      "num_tokens": 3496189.0,
      "reward": 1.3910624980926514,
      "reward_std": 0.0021650632843375206,
      "rewards/combined_reward/mean": 1.3910624980926514,
      "rewards/combined_reward/std": 0.28597628474235537,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 315.4,
      "completions/max_terminated_length": 315.4,
      "completions/mean_length": 99.93125,
      "completions/mean_terminated_length": 99.93125,
      "completions/min_length": 18.4,
      "completions/min_terminated_length": 18.4,
      "epoch": 0.03113543916034751,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 3.8047702312469482,
      "learning_rate": 1.9846798773179865e-07,
      "loss": 0.0118,
      "num_tokens": 3602282.0,
      "reward": 1.2963680744171142,
      "reward_std": 0.01609460562467575,
      "rewards/combined_reward/mean": 1.2963680744171142,
      "rewards/combined_reward/std": 0.3926819786429405,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 103.8,
      "completions/max_terminated_length": 103.8,
      "completions/mean_length": 52.2875,
      "completions/mean_terminated_length": 52.2875,
      "completions/min_length": 20.6,
      "completions/min_terminated_length": 20.6,
      "epoch": 0.03213980816552001,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.9826979690489249e-07,
      "loss": 0.0014,
      "num_tokens": 3717904.0,
      "reward": 1.403697907924652,
      "reward_std": 0.0003125001909211278,
      "rewards/combined_reward/mean": 1.403697907924652,
      "rewards/combined_reward/std": 0.24410614371299744,
      "step": 320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 79.8,
      "completions/max_terminated_length": 79.8,
      "completions/mean_length": 44.49375,
      "completions/mean_terminated_length": 44.49375,
      "completions/min_length": 16.1,
      "completions/min_terminated_length": 16.1,
      "epoch": 0.03314417717069251,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0,
      "learning_rate": 1.9805966549073822e-07,
      "loss": 0.0057,
      "num_tokens": 3825867.0,
      "reward": 1.3135937452316284,
      "reward_std": 0.007812501117587089,
      "rewards/combined_reward/mean": 1.3135937452316284,
      "rewards/combined_reward/std": 0.3756252348423004,
      "step": 330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 117.9,
      "completions/max_terminated_length": 117.9,
      "completions/mean_length": 54.15,
      "completions/mean_terminated_length": 54.15,
      "completions/min_length": 15.8,
      "completions/min_terminated_length": 15.8,
      "epoch": 0.03414854617586501,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.9783761902202812e-07,
      "loss": 0.0067,
      "num_tokens": 3942087.0,
      "reward": 1.290208351612091,
      "reward_std": 0.0010206203907728196,
      "rewards/combined_reward/mean": 1.290208351612091,
      "rewards/combined_reward/std": 0.27491325289011004,
      "step": 340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 89.2,
      "completions/max_terminated_length": 89.2,
      "completions/mean_length": 45.46875,
      "completions/mean_terminated_length": 45.46875,
      "completions/min_length": 12.9,
      "completions/min_terminated_length": 12.9,
      "epoch": 0.03515291518103751,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.9760368447923143e-07,
      "loss": 0.0,
      "num_tokens": 4077218.0,
      "reward": 1.271875011920929,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.271875011920929,
      "rewards/combined_reward/std": 0.3903637401759624,
      "step": 350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 86.1,
      "completions/max_terminated_length": 86.1,
      "completions/mean_length": 47.9125,
      "completions/mean_terminated_length": 47.9125,
      "completions/min_length": 19.1,
      "completions/min_terminated_length": 19.1,
      "epoch": 0.03615728418621001,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 5.847682952880859,
      "learning_rate": 1.9735789028731602e-07,
      "loss": -0.0023,
      "num_tokens": 4189144.0,
      "reward": 1.3238541960716248,
      "reward_std": 0.03020833432674408,
      "rewards/combined_reward/mean": 1.3238541960716248,
      "rewards/combined_reward/std": 0.32445888966321945,
      "step": 360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 103.3,
      "completions/max_terminated_length": 103.3,
      "completions/mean_length": 55.5,
      "completions/mean_terminated_length": 55.5,
      "completions/min_length": 20.6,
      "completions/min_terminated_length": 20.6,
      "epoch": 0.03716165319138252,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.9710026631229448e-07,
      "loss": 0.0001,
      "num_tokens": 4294100.0,
      "reward": 1.3909027934074403,
      "reward_std": 0.00034722290001809597,
      "rewards/combined_reward/mean": 1.3909027934074403,
      "rewards/combined_reward/std": 0.2816110193729401,
      "step": 370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 119.4,
      "completions/max_terminated_length": 119.4,
      "completions/mean_length": 57.65625,
      "completions/mean_terminated_length": 57.65625,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.038166022196555016,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.9683084385759522e-07,
      "loss": -0.0002,
      "num_tokens": 4400477.0,
      "reward": 1.333958351612091,
      "reward_std": 0.0012500007636845113,
      "rewards/combined_reward/mean": 1.333958351612091,
      "rewards/combined_reward/std": 0.2801030218601227,
      "step": 380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 112.6,
      "completions/max_terminated_length": 112.6,
      "completions/mean_length": 55.225,
      "completions/mean_terminated_length": 55.225,
      "completions/min_length": 18.2,
      "completions/min_terminated_length": 18.2,
      "epoch": 0.039170391201727515,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 4.480510234832764,
      "learning_rate": 1.9654965566025878e-07,
      "loss": 0.006,
      "num_tokens": 4516865.0,
      "reward": 1.370369803905487,
      "reward_std": 0.002187502384185791,
      "rewards/combined_reward/mean": 1.370369803905487,
      "rewards/combined_reward/std": 0.27093904092907906,
      "step": 390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 139.2,
      "completions/max_terminated_length": 139.2,
      "completions/mean_length": 55.54375,
      "completions/mean_terminated_length": 55.54375,
      "completions/min_length": 12.2,
      "completions/min_terminated_length": 12.2,
      "epoch": 0.040174760206900015,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.9625673588696007e-07,
      "loss": 0.0,
      "num_tokens": 4634776.0,
      "reward": 1.2619999647140503,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.2619999647140503,
      "rewards/combined_reward/std": 0.3673270642757416,
      "step": 400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 106.5,
      "completions/max_terminated_length": 106.5,
      "completions/mean_length": 52.2875,
      "completions/mean_terminated_length": 52.2875,
      "completions/min_length": 13.1,
      "completions/min_terminated_length": 13.1,
      "epoch": 0.041179129212072514,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 5.624104022979736,
      "learning_rate": 1.959521201298568e-07,
      "loss": 0.0061,
      "num_tokens": 4766894.0,
      "reward": 1.3308506846427917,
      "reward_std": 0.003342500701546669,
      "rewards/combined_reward/mean": 1.3308506846427917,
      "rewards/combined_reward/std": 0.37019643262028695,
      "step": 410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.9,
      "completions/max_terminated_length": 144.9,
      "completions/mean_length": 63.63125,
      "completions/mean_terminated_length": 63.63125,
      "completions/min_length": 18.3,
      "completions/min_terminated_length": 18.3,
      "epoch": 0.042183498217245013,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.956358454022648e-07,
      "loss": -0.0011,
      "num_tokens": 4887883.0,
      "reward": 1.3249478936195374,
      "reward_std": 0.016550703253597022,
      "rewards/combined_reward/mean": 1.3249478936195374,
      "rewards/combined_reward/std": 0.31248683035373687,
      "step": 420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 70.8,
      "completions/max_terminated_length": 70.8,
      "completions/mean_length": 40.03125,
      "completions/mean_terminated_length": 40.03125,
      "completions/min_length": 21.5,
      "completions/min_terminated_length": 21.5,
      "epoch": 0.04318786722241751,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.9530795013416043e-07,
      "loss": -0.0062,
      "num_tokens": 5017432.0,
      "reward": 1.2040624856948852,
      "reward_std": 0.003125,
      "rewards/combined_reward/mean": 1.2040624856948852,
      "rewards/combined_reward/std": 0.28724531903862954,
      "step": 430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0125,
      "completions/max_length": 95.9,
      "completions/max_terminated_length": 95.9,
      "completions/mean_length": 47.64375,
      "completions/mean_terminated_length": 46.64416732788086,
      "completions/min_length": 14.4,
      "completions/min_terminated_length": 14.4,
      "epoch": 0.04419223622759002,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.9496847416751122e-07,
      "loss": -0.0055,
      "num_tokens": 5127539.0,
      "reward": 1.3247395992279052,
      "reward_std": 0.005520834401249885,
      "rewards/combined_reward/mean": 1.3247395992279052,
      "rewards/combined_reward/std": 0.353334778547287,
      "step": 440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 101.6,
      "completions/max_terminated_length": 101.6,
      "completions/mean_length": 53.95625,
      "completions/mean_terminated_length": 53.95625,
      "completions/min_length": 21.6,
      "completions/min_terminated_length": 21.6,
      "epoch": 0.04519660523276252,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.9461745875143477e-07,
      "loss": -0.0013,
      "num_tokens": 5239592.0,
      "reward": 1.2362499833106995,
      "reward_std": 0.0016666660085320473,
      "rewards/combined_reward/mean": 1.2362499833106995,
      "rewards/combined_reward/std": 0.33721971064805983,
      "step": 450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 156.9,
      "completions/max_terminated_length": 156.9,
      "completions/mean_length": 73.56875,
      "completions/mean_terminated_length": 73.56875,
      "completions/min_length": 16.6,
      "completions/min_terminated_length": 16.6,
      "epoch": 0.04620097423793502,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.942549465371863e-07,
      "loss": -0.0051,
      "num_tokens": 5360759.0,
      "reward": 1.364300584793091,
      "reward_std": 0.0033333331346511843,
      "rewards/combined_reward/mean": 1.364300584793091,
      "rewards/combined_reward/std": 0.29198225438594816,
      "step": 460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 89.4,
      "completions/max_terminated_length": 89.4,
      "completions/mean_length": 49.9,
      "completions/mean_terminated_length": 49.9,
      "completions/min_length": 14.5,
      "completions/min_terminated_length": 14.5,
      "epoch": 0.04720534324310752,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.938809815729766e-07,
      "loss": 0.0,
      "num_tokens": 5489735.0,
      "reward": 1.2914583563804627,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.2914583563804627,
      "rewards/combined_reward/std": 0.32128691375255586,
      "step": 470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 116.8,
      "completions/max_terminated_length": 116.8,
      "completions/mean_length": 54.26875,
      "completions/mean_terminated_length": 54.26875,
      "completions/min_length": 16.8,
      "completions/min_terminated_length": 16.8,
      "epoch": 0.048209712248280016,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.9349560929861957e-07,
      "loss": 0.0036,
      "num_tokens": 5618126.0,
      "reward": 1.2964062452316285,
      "reward_std": 0.0034375011920928953,
      "rewards/combined_reward/mean": 1.2964062452316285,
      "rewards/combined_reward/std": 0.3410232897847891,
      "step": 480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.5,
      "completions/max_terminated_length": 138.5,
      "completions/mean_length": 63.425,
      "completions/mean_terminated_length": 63.425,
      "completions/min_length": 17.2,
      "completions/min_terminated_length": 17.2,
      "epoch": 0.049214081253452516,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 5.859716892242432,
      "learning_rate": 1.9309887654001093e-07,
      "loss": -0.0122,
      "num_tokens": 5732858.0,
      "reward": 1.3710416555404663,
      "reward_std": 0.005000000074505806,
      "rewards/combined_reward/mean": 1.3710416555404663,
      "rewards/combined_reward/std": 0.2569635409861803,
      "step": 490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 71.1,
      "completions/max_terminated_length": 71.1,
      "completions/mean_length": 37.5125,
      "completions/mean_terminated_length": 37.5125,
      "completions/min_length": 15.6,
      "completions/min_terminated_length": 15.6,
      "epoch": 0.05021845025862502,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.9269083150343857e-07,
      "loss": 0.0,
      "num_tokens": 5827508.0,
      "reward": 1.2737499952316285,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.2737499952316285,
      "rewards/combined_reward/std": 0.36351585388183594,
      "step": 500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 92.3,
      "completions/max_terminated_length": 92.3,
      "completions/mean_length": 49.31875,
      "completions/mean_terminated_length": 49.31875,
      "completions/min_length": 16.5,
      "completions/min_terminated_length": 16.5,
      "epoch": 0.05122281926379752,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.9227152376972505e-07,
      "loss": 0.0,
      "num_tokens": 5940043.0,
      "reward": 1.3223958492279053,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.3223958492279053,
      "rewards/combined_reward/std": 0.32680114805698396,
      "step": 510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 112.1,
      "completions/max_terminated_length": 112.1,
      "completions/mean_length": 60.84375,
      "completions/mean_terminated_length": 60.84375,
      "completions/min_length": 22.7,
      "completions/min_terminated_length": 22.7,
      "epoch": 0.05222718826897002,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.91841004288203e-07,
      "loss": 0.0,
      "num_tokens": 6061038.0,
      "reward": 1.3749479293823241,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.3749479293823241,
      "rewards/combined_reward/std": 0.2760587348602712,
      "step": 520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00625,
      "completions/max_length": 383.1,
      "completions/max_terminated_length": 211.9,
      "completions/mean_length": 101.45,
      "completions/mean_terminated_length": 89.37000045776367,
      "completions/min_length": 29.4,
      "completions/min_terminated_length": 29.4,
      "epoch": 0.05323155727414252,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.0,
      "learning_rate": 1.913993253705246e-07,
      "loss": 0.0182,
      "num_tokens": 6172502.0,
      "reward": 1.3482013940811157,
      "reward_std": 0.004686582600697875,
      "rewards/combined_reward/mean": 1.3482013940811157,
      "rewards/combined_reward/std": 0.26615125834941866,
      "step": 530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 116.0,
      "completions/max_terminated_length": 116.0,
      "completions/mean_length": 61.33125,
      "completions/mean_terminated_length": 61.33125,
      "completions/min_length": 26.7,
      "completions/min_terminated_length": 26.7,
      "epoch": 0.05423592627931502,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 6.519238471984863,
      "learning_rate": 1.9094654068430515e-07,
      "loss": -0.014,
      "num_tokens": 6279539.0,
      "reward": 1.456402564048767,
      "reward_std": 0.0006212619598954916,
      "rewards/combined_reward/mean": 1.456402564048767,
      "rewards/combined_reward/std": 0.17502975650131702,
      "step": 540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00625,
      "completions/max_length": 319.8,
      "completions/max_terminated_length": 302.8,
      "completions/mean_length": 102.7,
      "completions/mean_terminated_length": 92.22833557128907,
      "completions/min_length": 14.5,
      "completions/min_terminated_length": 14.5,
      "epoch": 0.05524029528448752,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 6.630038738250732,
      "learning_rate": 1.9048270524660196e-07,
      "loss": 0.0001,
      "num_tokens": 6401355.0,
      "reward": 1.2464791774749755,
      "reward_std": 0.016750000603497028,
      "rewards/combined_reward/mean": 1.2464791774749755,
      "rewards/combined_reward/std": 0.43877428472042085,
      "step": 550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 108.4,
      "completions/max_terminated_length": 108.4,
      "completions/mean_length": 57.21875,
      "completions/mean_terminated_length": 57.21875,
      "completions/min_length": 22.1,
      "completions/min_terminated_length": 22.1,
      "epoch": 0.05624466428966002,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 4.464468955993652,
      "learning_rate": 1.9000787541722936e-07,
      "loss": -0.0008,
      "num_tokens": 6512806.0,
      "reward": 1.3637500047683715,
      "reward_std": 0.0056250004563480616,
      "rewards/combined_reward/mean": 1.3637500047683715,
      "rewards/combined_reward/std": 0.25516389338299633,
      "step": 560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00625,
      "completions/max_length": 296.2,
      "completions/max_terminated_length": 201.7,
      "completions/mean_length": 86.75625,
      "completions/mean_terminated_length": 75.22125091552735,
      "completions/min_length": 24.9,
      "completions/min_terminated_length": 24.9,
      "epoch": 0.057249033294832524,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0,
      "learning_rate": 1.8952210889191065e-07,
      "loss": -0.0016,
      "num_tokens": 6619515.0,
      "reward": 1.3538541674613953,
      "reward_std": 0.009270833618938924,
      "rewards/combined_reward/mean": 1.3538541674613953,
      "rewards/combined_reward/std": 0.35525577939115466,
      "step": 570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 104.1,
      "completions/max_terminated_length": 104.1,
      "completions/mean_length": 48.9,
      "completions/mean_terminated_length": 48.9,
      "completions/min_length": 16.2,
      "completions/min_terminated_length": 16.2,
      "epoch": 0.058253402300005024,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.890254646952674e-07,
      "loss": 0.0,
      "num_tokens": 6728163.0,
      "reward": 1.2268749833106996,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.2268749833106996,
      "rewards/combined_reward/std": 0.33372554890811446,
      "step": 580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 115.7,
      "completions/max_terminated_length": 115.7,
      "completions/mean_length": 61.34375,
      "completions/mean_terminated_length": 61.34375,
      "completions/min_length": 20.3,
      "completions/min_terminated_length": 20.3,
      "epoch": 0.05925777130517752,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 3.076678991317749,
      "learning_rate": 1.885180031736477e-07,
      "loss": -0.0013,
      "num_tokens": 6845358.0,
      "reward": 1.3715885639190675,
      "reward_std": 0.0037068985402584076,
      "rewards/combined_reward/mean": 1.3715885639190675,
      "rewards/combined_reward/std": 0.3188589945435524,
      "step": 590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 122.6,
      "completions/max_terminated_length": 122.6,
      "completions/mean_length": 55.81875,
      "completions/mean_terminated_length": 55.81875,
      "completions/min_length": 14.8,
      "completions/min_terminated_length": 14.8,
      "epoch": 0.06026214031035002,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.879997859877932e-07,
      "loss": 0.0032,
      "num_tokens": 6971649.0,
      "reward": 1.280833327770233,
      "reward_std": 0.0006132050417363644,
      "rewards/combined_reward/mean": 1.280833327770233,
      "rewards/combined_reward/std": 0.338599956035614,
      "step": 600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 387.6,
      "completions/max_terminated_length": 192.1,
      "completions/mean_length": 122.46875,
      "completions/mean_terminated_length": 72.42708358764648,
      "completions/min_length": 23.3,
      "completions/min_terminated_length": 23.3,
      "epoch": 0.06126650931552252,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0,
      "learning_rate": 1.8747087610534734e-07,
      "loss": 0.019,
      "num_tokens": 7087600.0,
      "reward": 1.338072907924652,
      "reward_std": 0.013132144883275031,
      "rewards/combined_reward/mean": 1.338072907924652,
      "rewards/combined_reward/std": 0.30777021273970606,
      "step": 610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 112.7,
      "completions/max_terminated_length": 112.7,
      "completions/mean_length": 58.44375,
      "completions/mean_terminated_length": 58.44375,
      "completions/min_length": 23.9,
      "completions/min_terminated_length": 23.9,
      "epoch": 0.06227087832069502,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.8693133779320382e-07,
      "loss": -0.0031,
      "num_tokens": 7191467.0,
      "reward": 1.3348880290985108,
      "reward_std": 0.007124999910593033,
      "rewards/combined_reward/mean": 1.3348880290985108,
      "rewards/combined_reward/std": 0.2751554258167744,
      "step": 620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.8,
      "completions/max_terminated_length": 194.8,
      "completions/mean_length": 84.76875,
      "completions/mean_terminated_length": 84.76875,
      "completions/min_length": 21.4,
      "completions/min_terminated_length": 21.4,
      "epoch": 0.06327524732586752,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0,
      "learning_rate": 1.8638123660969793e-07,
      "loss": -0.0084,
      "num_tokens": 7304146.0,
      "reward": 1.3757467865943909,
      "reward_std": 0.0030034731142222883,
      "rewards/combined_reward/mean": 1.3757467865943909,
      "rewards/combined_reward/std": 0.28882216811180117,
      "step": 630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 101.9,
      "completions/max_terminated_length": 101.9,
      "completions/mean_length": 56.925,
      "completions/mean_terminated_length": 56.925,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.06427961633104003,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.858206393966405e-07,
      "loss": 0.0,
      "num_tokens": 7415006.0,
      "reward": 1.3215104341506958,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.3215104341506958,
      "rewards/combined_reward/std": 0.33309968262910844,
      "step": 640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 106.5,
      "completions/max_terminated_length": 106.5,
      "completions/mean_length": 58.26875,
      "completions/mean_terminated_length": 58.26875,
      "completions/min_length": 14.2,
      "completions/min_terminated_length": 14.2,
      "epoch": 0.06528398533621252,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.8524961427119615e-07,
      "loss": -0.009,
      "num_tokens": 7546381.0,
      "reward": 1.3129427313804627,
      "reward_std": 0.002951054647564888,
      "rewards/combined_reward/mean": 1.3129427313804627,
      "rewards/combined_reward/std": 0.3575292468070984,
      "step": 650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 84.0,
      "completions/max_terminated_length": 84.0,
      "completions/mean_length": 46.75625,
      "completions/mean_terminated_length": 46.75625,
      "completions/min_length": 15.4,
      "completions/min_terminated_length": 15.4,
      "epoch": 0.06628835434138503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.846682306176065e-07,
      "loss": 0.0,
      "num_tokens": 7668158.0,
      "reward": 1.3184374928474427,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.3184374928474427,
      "rewards/combined_reward/std": 0.35122168958187105,
      "step": 660
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 101.7,
      "completions/max_terminated_length": 101.7,
      "completions/mean_length": 56.3375,
      "completions/mean_terminated_length": 56.3375,
      "completions/min_length": 17.9,
      "completions/min_terminated_length": 17.9,
      "epoch": 0.06729272334655753,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.8407655907875938e-07,
      "loss": 0.0006,
      "num_tokens": 7794644.0,
      "reward": 1.331454861164093,
      "reward_std": 0.007124999910593033,
      "rewards/combined_reward/mean": 1.331454861164093,
      "rewards/combined_reward/std": 0.3434182394295931,
      "step": 670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.7,
      "completions/max_terminated_length": 135.7,
      "completions/mean_length": 68.90625,
      "completions/mean_terminated_length": 68.90625,
      "completions/min_length": 16.1,
      "completions/min_terminated_length": 16.1,
      "epoch": 0.06829709235173002,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0,
      "learning_rate": 1.8347467154760515e-07,
      "loss": 0.0079,
      "num_tokens": 7913933.0,
      "reward": 1.3356944441795349,
      "reward_std": 0.0053335148841142654,
      "rewards/combined_reward/mean": 1.3356944441795349,
      "rewards/combined_reward/std": 0.3590264985337853,
      "step": 680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 265.9,
      "completions/max_terminated_length": 265.9,
      "completions/mean_length": 91.5,
      "completions/mean_terminated_length": 91.5,
      "completions/min_length": 24.8,
      "completions/min_terminated_length": 24.8,
      "epoch": 0.06930146135690253,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.8286264115842114e-07,
      "loss": 0.0017,
      "num_tokens": 8033153.0,
      "reward": 1.3431249916553498,
      "reward_std": 0.0044791650027036665,
      "rewards/combined_reward/mean": 1.3431249916553498,
      "rewards/combined_reward/std": 0.3242304854094982,
      "step": 690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 73.2,
      "completions/max_terminated_length": 73.2,
      "completions/mean_length": 39.55625,
      "completions/mean_terminated_length": 39.55625,
      "completions/min_length": 17.7,
      "completions/min_terminated_length": 17.7,
      "epoch": 0.07030583036207502,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.8224054227792522e-07,
      "loss": -0.003,
      "num_tokens": 8147198.0,
      "reward": 1.3440885424613953,
      "reward_std": 0.0002604176523163915,
      "rewards/combined_reward/mean": 1.3440885424613953,
      "rewards/combined_reward/std": 0.3006736177019775,
      "step": 700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 123.6,
      "completions/max_terminated_length": 123.6,
      "completions/mean_length": 67.76875,
      "completions/mean_terminated_length": 67.76875,
      "completions/min_length": 25.8,
      "completions/min_terminated_length": 25.8,
      "epoch": 0.07131019936724753,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 1.436936616897583,
      "learning_rate": 1.816084504962396e-07,
      "loss": 0.0009,
      "num_tokens": 8248985.0,
      "reward": 1.459496557712555,
      "reward_std": 0.002500000596046448,
      "rewards/combined_reward/mean": 1.459496557712555,
      "rewards/combined_reward/std": 0.15663873171433806,
      "step": 710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.1,
      "completions/max_terminated_length": 172.1,
      "completions/mean_length": 76.96875,
      "completions/mean_terminated_length": 76.96875,
      "completions/min_length": 24.5,
      "completions/min_terminated_length": 24.5,
      "epoch": 0.07231456837242002,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.8096644261770608e-07,
      "loss": 0.0179,
      "num_tokens": 8373128.0,
      "reward": 1.3943750143051148,
      "reward_std": 0.005624998733401299,
      "rewards/combined_reward/mean": 1.3943750143051148,
      "rewards/combined_reward/std": 0.24296645894646646,
      "step": 720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 120.0,
      "completions/max_terminated_length": 120.0,
      "completions/mean_length": 60.15625,
      "completions/mean_terminated_length": 60.15625,
      "completions/min_length": 18.8,
      "completions/min_terminated_length": 18.8,
      "epoch": 0.07331893737759253,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.8031459665155363e-07,
      "loss": -0.001,
      "num_tokens": 8487649.0,
      "reward": 1.4223046898841858,
      "reward_std": 0.0001302093267440796,
      "rewards/combined_reward/mean": 1.4223046898841858,
      "rewards/combined_reward/std": 0.2848698660731316,
      "step": 730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 77.8,
      "completions/max_terminated_length": 77.8,
      "completions/mean_length": 45.84375,
      "completions/mean_terminated_length": 45.84375,
      "completions/min_length": 18.4,
      "completions/min_terminated_length": 18.4,
      "epoch": 0.07432330638276503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.796529918024196e-07,
      "loss": 0.0,
      "num_tokens": 8603284.0,
      "reward": 1.37947918176651,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.37947918176651,
      "rewards/combined_reward/std": 0.27231944501399996,
      "step": 740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 204.9,
      "completions/max_terminated_length": 204.9,
      "completions/mean_length": 76.9375,
      "completions/mean_terminated_length": 76.9375,
      "completions/min_length": 18.3,
      "completions/min_terminated_length": 18.3,
      "epoch": 0.07532767538793753,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.7898170846072592e-07,
      "loss": 0.0009,
      "num_tokens": 8718758.0,
      "reward": 1.32010418176651,
      "reward_std": 0.002500000596046448,
      "rewards/combined_reward/mean": 1.32010418176651,
      "rewards/combined_reward/std": 0.34439257588237526,
      "step": 750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 64.11875,
      "completions/mean_terminated_length": 64.11875,
      "completions/min_length": 16.4,
      "completions/min_terminated_length": 16.4,
      "epoch": 0.07633204439311003,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0,
      "learning_rate": 1.783008281929106e-07,
      "loss": -0.0051,
      "num_tokens": 8833993.0,
      "reward": 1.3178860425949097,
      "reward_std": 0.016688717156648637,
      "rewards/combined_reward/mean": 1.3178860425949097,
      "rewards/combined_reward/std": 0.3388564258813858,
      "step": 760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 122.4,
      "completions/max_terminated_length": 122.4,
      "completions/mean_length": 62.99375,
      "completions/mean_terminated_length": 62.99375,
      "completions/min_length": 21.2,
      "completions/min_terminated_length": 21.2,
      "epoch": 0.07733641339828252,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 1.1234172582626343,
      "learning_rate": 1.7761043373151713e-07,
      "loss": -0.0046,
      "num_tokens": 8950896.0,
      "reward": 1.3376388788223266,
      "reward_std": 0.00034722290001809597,
      "rewards/combined_reward/mean": 1.3376388788223266,
      "rewards/combined_reward/std": 0.34661323949694633,
      "step": 770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 111.0,
      "completions/max_terminated_length": 111.0,
      "completions/mean_length": 56.3,
      "completions/mean_terminated_length": 56.3,
      "completions/min_length": 20.3,
      "completions/min_terminated_length": 20.3,
      "epoch": 0.07834078240345503,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.7691060896514168e-07,
      "loss": -0.0003,
      "num_tokens": 9071600.0,
      "reward": 1.3996267199516297,
      "reward_std": 0.002080751396715641,
      "rewards/combined_reward/mean": 1.3996267199516297,
      "rewards/combined_reward/std": 0.26108508543111386,
      "step": 780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 79.4,
      "completions/max_terminated_length": 79.4,
      "completions/mean_length": 45.76875,
      "completions/mean_terminated_length": 45.76875,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.07934515140862752,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.7620143892823975e-07,
      "loss": -0.0062,
      "num_tokens": 9174599.0,
      "reward": 1.378697919845581,
      "reward_std": 0.0003125001909211278,
      "rewards/combined_reward/mean": 1.378697919845581,
      "rewards/combined_reward/std": 0.2739857309497893,
      "step": 790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 98.3,
      "completions/max_terminated_length": 98.3,
      "completions/mean_length": 50.98125,
      "completions/mean_terminated_length": 50.98125,
      "completions/min_length": 19.2,
      "completions/min_terminated_length": 19.2,
      "epoch": 0.08034952041380003,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.7548300979079413e-07,
      "loss": -0.0008,
      "num_tokens": 9284796.0,
      "reward": 1.368190097808838,
      "reward_std": 0.004609373956918716,
      "rewards/combined_reward/mean": 1.368190097808838,
      "rewards/combined_reward/std": 0.25843119765631856,
      "step": 800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 80.0,
      "completions/max_terminated_length": 80.0,
      "completions/mean_length": 41.175,
      "completions/mean_terminated_length": 41.175,
      "completions/min_length": 12.8,
      "completions/min_terminated_length": 12.8,
      "epoch": 0.08135388941897254,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.7475540884784422e-07,
      "loss": 0.0,
      "num_tokens": 9398356.0,
      "reward": 1.2378819465637207,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.2378819465637207,
      "rewards/combined_reward/std": 0.3914600659161806,
      "step": 810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 96.3,
      "completions/max_terminated_length": 96.3,
      "completions/mean_length": 54.50625,
      "completions/mean_terminated_length": 54.50625,
      "completions/min_length": 19.6,
      "completions/min_terminated_length": 19.6,
      "epoch": 0.08235825842414503,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.7401872450887915e-07,
      "loss": -0.0007,
      "num_tokens": 9497821.0,
      "reward": 1.3947187542915345,
      "reward_std": 0.0015624999767169356,
      "rewards/combined_reward/mean": 1.3947187542915345,
      "rewards/combined_reward/std": 0.2990885377395898,
      "step": 820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 96.7,
      "completions/max_terminated_length": 96.7,
      "completions/mean_length": 49.1875,
      "completions/mean_terminated_length": 49.1875,
      "completions/min_length": 17.9,
      "completions/min_terminated_length": 17.9,
      "epoch": 0.08336262742931753,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.7327304628709528e-07,
      "loss": 0.0,
      "num_tokens": 9641355.0,
      "reward": 1.3011458396911622,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.3011458396911622,
      "rewards/combined_reward/std": 0.2698082665912807,
      "step": 830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 99.3,
      "completions/max_terminated_length": 99.3,
      "completions/mean_length": 54.9125,
      "completions/mean_terminated_length": 54.9125,
      "completions/min_length": 21.7,
      "completions/min_terminated_length": 21.7,
      "epoch": 0.08436699643449003,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.7251846478851951e-07,
      "loss": 0.0083,
      "num_tokens": 9759969.0,
      "reward": 1.2925694584846497,
      "reward_std": 0.0019245008006691933,
      "rewards/combined_reward/mean": 1.2925694584846497,
      "rewards/combined_reward/std": 0.26882885694503783,
      "step": 840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 168.1,
      "completions/max_terminated_length": 168.1,
      "completions/mean_length": 66.68125,
      "completions/mean_terminated_length": 66.68125,
      "completions/min_length": 19.3,
      "completions/min_terminated_length": 19.3,
      "epoch": 0.08537136543966253,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 6.147635459899902,
      "learning_rate": 1.7175507170100008e-07,
      "loss": -0.0077,
      "num_tokens": 9881310.0,
      "reward": 1.2720364809036255,
      "reward_std": 0.011238560592755676,
      "rewards/combined_reward/mean": 1.2720364809036255,
      "rewards/combined_reward/std": 0.31835093796253205,
      "step": 850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 91.0,
      "completions/max_terminated_length": 91.0,
      "completions/mean_length": 47.25,
      "completions/mean_terminated_length": 47.25,
      "completions/min_length": 23.2,
      "completions/min_terminated_length": 23.2,
      "epoch": 0.08637573444483503,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 1.287226676940918,
      "learning_rate": 1.7098295978306552e-07,
      "loss": -0.012,
      "num_tokens": 9981046.0,
      "reward": 1.322606337070465,
      "reward_std": 0.0022470591589808463,
      "rewards/combined_reward/mean": 1.322606337070465,
      "rewards/combined_reward/std": 0.3106359137222171,
      "step": 860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 91.9,
      "completions/max_terminated_length": 91.9,
      "completions/mean_length": 46.50625,
      "completions/mean_terminated_length": 46.50625,
      "completions/min_length": 13.9,
      "completions/min_terminated_length": 13.9,
      "epoch": 0.08738010345000753,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.7020222285265395e-07,
      "loss": 0.0,
      "num_tokens": 10089371.0,
      "reward": 1.2643750071525575,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.2643750071525575,
      "rewards/combined_reward/std": 0.4044176399707794,
      "step": 870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 217.1,
      "completions/max_terminated_length": 217.1,
      "completions/mean_length": 70.81875,
      "completions/mean_terminated_length": 70.81875,
      "completions/min_length": 18.9,
      "completions/min_terminated_length": 18.9,
      "epoch": 0.08838447245518004,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.6941295577571328e-07,
      "loss": 0.0079,
      "num_tokens": 10197254.0,
      "reward": 1.309374988079071,
      "reward_std": 0.002500000596046448,
      "rewards/combined_reward/mean": 1.309374988079071,
      "rewards/combined_reward/std": 0.325995758920908,
      "step": 880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 94.7,
      "completions/max_terminated_length": 94.7,
      "completions/mean_length": 53.04375,
      "completions/mean_terminated_length": 53.04375,
      "completions/min_length": 22.5,
      "completions/min_terminated_length": 22.5,
      "epoch": 0.08938884146035253,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.686152544546743e-07,
      "loss": 0.0008,
      "num_tokens": 10316525.0,
      "reward": 1.3464062690734864,
      "reward_std": 0.00416666641831398,
      "rewards/combined_reward/mean": 1.3464062690734864,
      "rewards/combined_reward/std": 0.2880703628063202,
      "step": 890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 97.0,
      "completions/max_terminated_length": 97.0,
      "completions/mean_length": 50.95625,
      "completions/mean_terminated_length": 50.95625,
      "completions/min_length": 16.8,
      "completions/min_terminated_length": 16.8,
      "epoch": 0.09039321046552504,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0,
      "learning_rate": 1.6780921581679763e-07,
      "loss": 0.0021,
      "num_tokens": 10435242.0,
      "reward": 1.2726041793823242,
      "reward_std": 0.009523502597585321,
      "rewards/combined_reward/mean": 1.2726041793823242,
      "rewards/combined_reward/std": 0.33535852897912266,
      "step": 900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 104.1,
      "completions/max_terminated_length": 104.1,
      "completions/mean_length": 57.20625,
      "completions/mean_terminated_length": 57.20625,
      "completions/min_length": 19.4,
      "completions/min_terminated_length": 19.4,
      "epoch": 0.09139757947069753,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.6699493780239649e-07,
      "loss": 0.0,
      "num_tokens": 10548043.0,
      "reward": 1.3535937666893005,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.3535937666893005,
      "rewards/combined_reward/std": 0.33704030215740205,
      "step": 910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 107.5,
      "completions/max_terminated_length": 107.5,
      "completions/mean_length": 52.25,
      "completions/mean_terminated_length": 52.25,
      "completions/min_length": 16.2,
      "completions/min_terminated_length": 16.2,
      "epoch": 0.09240194847587004,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.6617251935293588e-07,
      "loss": -0.0028,
      "num_tokens": 10675027.0,
      "reward": 1.3419270992279053,
      "reward_std": 0.0015625,
      "rewards/combined_reward/mean": 1.3419270992279053,
      "rewards/combined_reward/std": 0.32070667631924155,
      "step": 920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 104.1,
      "completions/max_terminated_length": 104.1,
      "completions/mean_length": 58.05625,
      "completions/mean_terminated_length": 58.05625,
      "completions/min_length": 25.7,
      "completions/min_terminated_length": 25.7,
      "epoch": 0.09340631748104254,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.6534206039901054e-07,
      "loss": 0.0,
      "num_tokens": 10805048.0,
      "reward": 1.4538020730018615,
      "reward_std": 0.0005208343267440796,
      "rewards/combined_reward/mean": 1.4538020730018615,
      "rewards/combined_reward/std": 0.17151957787573338,
      "step": 930
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 79.8,
      "completions/max_terminated_length": 79.8,
      "completions/mean_length": 39.75,
      "completions/mean_terminated_length": 39.75,
      "completions/min_length": 12.6,
      "completions/min_terminated_length": 12.6,
      "epoch": 0.09441068648621503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.6450366184820256e-07,
      "loss": 0.0,
      "num_tokens": 10906272.0,
      "reward": 1.258458322286606,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.258458322286606,
      "rewards/combined_reward/std": 0.3260463088750839,
      "step": 940
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 118.2,
      "completions/max_terminated_length": 118.2,
      "completions/mean_length": 61.65625,
      "completions/mean_terminated_length": 61.65625,
      "completions/min_length": 22.5,
      "completions/min_terminated_length": 22.5,
      "epoch": 0.09541505549138754,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.6365742557282017e-07,
      "loss": 0.0091,
      "num_tokens": 11023301.0,
      "reward": 1.3930208325386046,
      "reward_std": 0.0050495008006691934,
      "rewards/combined_reward/mean": 1.3930208325386046,
      "rewards/combined_reward/std": 0.30010328590869906,
      "step": 950
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 105.4,
      "completions/max_terminated_length": 105.4,
      "completions/mean_length": 55.79375,
      "completions/mean_terminated_length": 55.79375,
      "completions/min_length": 23.8,
      "completions/min_terminated_length": 23.8,
      "epoch": 0.09641942449656003,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0,
      "learning_rate": 1.6280345439751956e-07,
      "loss": 0.0044,
      "num_tokens": 11148588.0,
      "reward": 1.3295885443687439,
      "reward_std": 0.024523502215743065,
      "rewards/combined_reward/mean": 1.3295885443687439,
      "rewards/combined_reward/std": 0.2928910902235657,
      "step": 960
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 121.7,
      "completions/max_terminated_length": 121.7,
      "completions/mean_length": 57.56875,
      "completions/mean_terminated_length": 57.56875,
      "completions/min_length": 14.2,
      "completions/min_terminated_length": 14.2,
      "epoch": 0.09742379350173254,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0,
      "learning_rate": 1.6194185208681082e-07,
      "loss": -0.0043,
      "num_tokens": 11268271.0,
      "reward": 1.2413020730018616,
      "reward_std": 0.005312500335276127,
      "rewards/combined_reward/mean": 1.2413020730018616,
      "rewards/combined_reward/std": 0.3525692358613014,
      "step": 970
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 117.1,
      "completions/max_terminated_length": 117.1,
      "completions/mean_length": 57.45625,
      "completions/mean_terminated_length": 57.45625,
      "completions/min_length": 19.1,
      "completions/min_terminated_length": 19.1,
      "epoch": 0.09842816250690503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.610727233324495e-07,
      "loss": 0.0,
      "num_tokens": 11388376.0,
      "reward": 1.2743749976158143,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.2743749976158143,
      "rewards/combined_reward/std": 0.2959941983222961,
      "step": 980
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 99.0,
      "completions/max_terminated_length": 99.0,
      "completions/mean_length": 51.1875,
      "completions/mean_terminated_length": 51.1875,
      "completions/min_length": 15.9,
      "completions/min_terminated_length": 15.9,
      "epoch": 0.09943253151207754,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.6019617374071597e-07,
      "loss": 0.0001,
      "num_tokens": 11503346.0,
      "reward": 1.3223437547683716,
      "reward_std": 0.0028867511078715324,
      "rewards/combined_reward/mean": 1.3223437547683716,
      "rewards/combined_reward/std": 0.37292833551764487,
      "step": 990
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 146.8,
      "completions/max_terminated_length": 146.8,
      "completions/mean_length": 64.61875,
      "completions/mean_terminated_length": 64.61875,
      "completions/min_length": 23.9,
      "completions/min_terminated_length": 23.9,
      "epoch": 0.10043690051725004,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 1.5931230981958326e-07,
      "loss": 0.0,
      "num_tokens": 11600585.0,
      "reward": 1.3246874928474426,
      "reward_std": 0.0,
      "rewards/combined_reward/mean": 1.3246874928474426,
      "rewards/combined_reward/std": 0.23927139891311527,
      "step": 1000
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 118.5,
      "completions/max_terminated_length": 118.5,
      "completions/mean_length": 65.5,
      "completions/mean_terminated_length": 65.5,
      "completions/min_length": 19.9,
      "completions/min_terminated_length": 19.9,
      "epoch": 0.10144126952242254,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.5842123896577543e-07,
      "loss": -0.0036,
      "num_tokens": 11737513.0,
      "reward": 1.4228541851043701,
      "reward_std": 0.001154701132327318,
      "rewards/combined_reward/mean": 1.4228541851043701,
      "rewards/combined_reward/std": 0.25313766626641154,
      "step": 1010
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01875,
      "completions/max_length": 311.5,
      "completions/max_terminated_length": 224.3,
      "completions/mean_length": 90.28125,
      "completions/mean_terminated_length": 54.49903869628906,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.10244563852759504,
      "frac_reward_zero_std": 0.925,
      "grad_norm": 0.0,
      "learning_rate": 1.5752306945171818e-07,
      "loss": -0.0115,
      "num_tokens": 11875626.0,
      "reward": 1.2103593707084657,
      "reward_std": 0.004468750953674316,
      "rewards/combined_reward/mean": 1.2103593707084657,
      "rewards/combined_reward/std": 0.40379793345928194,
      "step": 1020
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 128.9,
      "completions/max_terminated_length": 128.9,
      "completions/mean_length": 59.56875,
      "completions/mean_terminated_length": 59.56875,
      "completions/min_length": 15.4,
      "completions/min_terminated_length": 15.4,
      "epoch": 0.10345000753276754,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.5661791041238254e-07,
      "loss": 0.0054,
      "num_tokens": 11995581.0,
      "reward": 1.3099791407585144,
      "reward_std": 0.00020833313465118408,
      "rewards/combined_reward/mean": 1.3099791407585144,
      "rewards/combined_reward/std": 0.33452749061398207,
      "step": 1030
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01875,
      "completions/max_length": 343.2,
      "completions/max_terminated_length": 228.1,
      "completions/mean_length": 114.825,
      "completions/mean_terminated_length": 78.1860580444336,
      "completions/min_length": 25.7,
      "completions/min_terminated_length": 25.7,
      "epoch": 0.10445437653794004,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.5570587183202433e-07,
      "loss": -0.0099,
      "num_tokens": 12114797.0,
      "reward": 1.2818815290927887,
      "reward_std": 0.0018619796261191367,
      "rewards/combined_reward/mean": 1.2818815290927887,
      "rewards/combined_reward/std": 0.31765228807926177,
      "step": 1040
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 113.8,
      "completions/max_terminated_length": 113.8,
      "completions/mean_length": 55.68125,
      "completions/mean_terminated_length": 55.68125,
      "completions/min_length": 16.7,
      "completions/min_terminated_length": 16.7,
      "epoch": 0.10545874554311253,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.5478706453082016e-07,
      "loss": -0.0016,
      "num_tokens": 12246978.0,
      "reward": 1.3307923913002013,
      "reward_std": 0.0002604176523163915,
      "rewards/combined_reward/mean": 1.3307923913002013,
      "rewards/combined_reward/std": 0.3518651008605957,
      "step": 1050
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 144.1,
      "completions/max_terminated_length": 144.1,
      "completions/mean_length": 69.0,
      "completions/mean_terminated_length": 69.0,
      "completions/min_length": 17.6,
      "completions/min_terminated_length": 17.6,
      "epoch": 0.10646311454828504,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.5386160015140167e-07,
      "loss": 0.0061,
      "num_tokens": 12363690.0,
      "reward": 1.3816666841506957,
      "reward_std": 0.00692450013011694,
      "rewards/combined_reward/mean": 1.3816666841506957,
      "rewards/combined_reward/std": 0.2784981057047844,
      "step": 1060
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 94.3,
      "completions/max_terminated_length": 94.3,
      "completions/mean_length": 49.63125,
      "completions/mean_terminated_length": 49.63125,
      "completions/min_length": 13.9,
      "completions/min_terminated_length": 13.9,
      "epoch": 0.10746748355345755,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.5292959114529024e-07,
      "loss": 0.0011,
      "num_tokens": 12481815.0,
      "reward": 1.3338541746139527,
      "reward_std": 0.002886752039194107,
      "rewards/combined_reward/mean": 1.3338541746139527,
      "rewards/combined_reward/std": 0.3240374196320772,
      "step": 1070
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 97.0,
      "completions/max_terminated_length": 97.0,
      "completions/mean_length": 49.3375,
      "completions/mean_terminated_length": 49.3375,
      "completions/min_length": 19.6,
      "completions/min_terminated_length": 19.6,
      "epoch": 0.10847185255863004,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.5199115075923323e-07,
      "loss": -0.0008,
      "num_tokens": 12604637.0,
      "reward": 1.2796875,
      "reward_std": 0.0003608435858041048,
      "rewards/combined_reward/mean": 1.2796875,
      "rewards/combined_reward/std": 0.3038814663887024,
      "step": 1080
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 313.5,
      "completions/max_terminated_length": 115.4,
      "completions/mean_length": 112.05,
      "completions/mean_terminated_length": 61.88333358764648,
      "completions/min_length": 20.3,
      "completions/min_terminated_length": 20.3,
      "epoch": 0.10947622156380254,
      "frac_reward_zero_std": 0.975,
      "grad_norm": 0.0,
      "learning_rate": 1.5104639302144326e-07,
      "loss": 0.0052,
      "num_tokens": 12735697.0,
      "reward": 1.342291682958603,
      "reward_std": 0.0007216888945549727,
      "rewards/combined_reward/mean": 1.342291682958603,
      "rewards/combined_reward/std": 0.31657470017671585,
      "step": 1090
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 127.9,
      "completions/max_terminated_length": 127.9,
      "completions/mean_length": 61.70625,
      "completions/mean_terminated_length": 61.70625,
      "completions/min_length": 17.8,
      "completions/min_terminated_length": 17.8,
      "epoch": 0.11048059056897504,
      "frac_reward_zero_std": 0.95,
      "grad_norm": 0.0,
      "learning_rate": 1.5009543272774323e-07,
      "loss": 0.0029,
      "num_tokens": 12842590.0,
      "reward": 1.3991406440734864,
      "reward_std": 0.000572918844409287,
      "rewards/combined_reward/mean": 1.3991406440734864,
      "rewards/combined_reward/std": 0.27981497598811983,
      "step": 1100
    }
  ],
  "logging_steps": 10,
  "max_steps": 3000,
  "num_input_tokens_seen": 12842590,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}